MCPcopy
hub / github.com/openai/plugins / normalize_samples

Function normalize_samples

plugins/ngs-analysis/scripts/run_fastq_assay_package.py:333–437  ·  view source on GitHub ↗
(
    args: argparse.Namespace,
)

Source from the content-addressed store, hash-verified

331
332
333def normalize_samples(
334 args: argparse.Namespace,
335) -> tuple[dict[str, Any], list[dict[str, str]], list[Path]]:
336 sample_sheet = args.sample_sheet.expanduser().resolve()
337 rows, columns = read_table(sample_sheet)
338 roots = [root.expanduser().resolve() for root in args.fastq_root]
339 roots.extend([sample_sheet.parent, Path.cwd()])
340 normalized: list[dict[str, str]] = []
341 fastq_paths: list[Path] = []
342 errors: list[str] = []
343 warnings: list[str] = []
344 fastq_checks = []
345
346 for row_index, row in enumerate(rows, start=2):
347 sample = (
348 first_present(row, ["sample", "sample_id", "sampleID", "run_accession"])
349 or f"row_{row_index}"
350 )
351 r1_raw = first_present(row, ["fastq_1", "forwardReads", "r1", "read1"])
352 r2_raw = first_present(row, ["fastq_2", "reverseReads", "r2", "read2"])
353 fasta_raw = first_present(row, ["fasta"])
354 if not r1_raw and not fasta_raw:
355 errors.append(f"row {row_index}: fastq_1/forwardReads or fasta is required")
356 continue
357 r1 = resolve_existing_path(r1_raw, sample_sheet.parent, roots) if r1_raw else None
358 r2 = resolve_existing_path(r2_raw, sample_sheet.parent, roots) if r2_raw else None
359 fasta = resolve_existing_path(fasta_raw, sample_sheet.parent, roots) if fasta_raw else None
360 if r1_raw and not r1:
361 errors.append(f"row {row_index}: could not resolve read 1 path {r1_raw}")
362 if r2_raw and not r2:
363 errors.append(f"row {row_index}: could not resolve read 2 path {r2_raw}")
364 if fasta_raw and not fasta:
365 errors.append(f"row {row_index}: could not resolve fasta path {fasta_raw}")
366 for read_label, read_path in [("r1", r1), ("r2", r2)]:
367 if read_path is None:
368 continue
369 fastq_paths.append(read_path)
370 check = check_fastq(read_path, args.fastq_record_check)
371 check["sample"] = sample
372 check["read"] = read_label
373 fastq_checks.append(check)
374 if check["errors"]:
375 errors.extend(f"{sample} {read_label}: {error}" for error in check["errors"])
376 normalized.append(
377 {
378 "sample": sample,
379 "row_index": str(row_index),
380 "fastq_1": str(r1) if r1 else "",
381 "fastq_2": str(r2) if r2 else "",
382 "fasta": str(fasta) if fasta else "",
383 "layout": "PE" if r2 else ("SE" if r1 else "FASTA"),
384 "marker": first_present(row, ["marker", "target", "region"]),
385 "assay": first_present(row, ["assay", "library_strategy"]) or args.lane,
386 "instrument_platform": first_present(row, ["instrument_platform", "platform"]),
387 "host_organism": first_present(
388 row, ["host_organism", "host", "host_species", "organism"]
389 ),
390 "genome_build": first_present(

Callers 1

main · Function · 0.85

Calls 7

metadata_warnings · Function · 0.85
summarize_input_context · Function · 0.85
extend · Method · 0.80
read_table · Function · 0.70
first_present · Function · 0.70
resolve_existing_path · Function · 0.70
check_fastq · Function · 0.70

Tested by

no test coverage detected