InPreD · tinavisnovska · Mar 6, 2024 · Feb 7, 2024 · Feb 8, 2024 · Feb 14, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,7 @@
 FROM python:3.11.4-slim
 ENV PATH=$PATH:/opt
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get clean
 COPY requirements.txt /
 RUN pip install --no-cache-dir -r requirements.txt \
     && rm requirements.txt
 COPY samplesheet_generator.py /opt/
-COPY test /opt/test
 COPY indexes /opt/indexes
diff --git a/README.md b/README.md
@@ -3,9 +3,10 @@ generates samplesheet compatible with TSO500 LocalApp analysis.
 
 ## Contents
 
-1. [Dependencies](#dependencies)
-2. [Description of Input Parameters](#description-of-input-parameters)
-3. [Usage](#usage)
+1. [Introduction](#introduction)
+2. [Dependencies](#dependencies)
+3. [Description of Input Parameters](#description-of-input-parameters)
+4. [Usage](#usage)
 
 ## Introduction
 
@@ -131,47 +132,17 @@ apptainer run \
 
 ### Run Test Data Example
 
-The script is tested with data of a specific sequencing run. The run consists of artificial samples, including AcroMetrix samples. The sequencing was performed on nextseq, with the legacy parameter setting and file formats.
+Test data are located in the `test` subfolder of the repository. The input info file is named `infoFile.tsv`, and the expected output is in `samplesheet.tsv`.
 
-#### Test Data Input File
+The script is tested with data of a specific sequencing run. The run consists of artificial samples, including AcroMetrix samples. The sequencing was performed on a NextSeq instrument, with the legacy parameter setting and file formats.
 
-The input info file `infoFile.tsv` is located in `test`folder of this repository and in `/opt/test` of the created Docekr image. The content of the file follows:
-
-
-
-```
-sample_id	molecule	run_id	barcode	index
-CLAcroMetrix-D01-X01-X00	DNA	191206_NB501498_0174_AHWCNMBGXC	NA	TCCGGAGA
-```
-
-#### Test Data Output 
-
-```
-[Header]
-Investigator Name,Name (InPreD node)
-Experiment Name,OUS pathology test run
-Date,07/02/2024
-
-[Reads]
-101
-101
-
-[Settings]
-AdapterRead1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
-AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
-AdapterBehavior,trim
-MinimumTrimmedReadLength,35
-MaskShortReads,22
-OverrideCycles,U7N1Y93;I8;I8;U7N1Y93
-
-[Data]
-Sample_ID,Sample_Type,Pair_ID,index,I7_Index_ID,index2,I5_Index_ID
-CLAcroMetrix-D01-X01-X00,DNA,CLAcroMetrix-D01-X01-X00,TCCGGAGA,D702,AGGATAGG,D503
-```
 
 #### Locally
 
 ```
+# ${GITHUB_REPOSITORY_LOCAL_PATH} is an absolute path 
+# to the samplesheet_generator repository on the local computer.
+
 # define the testRunID value
 testRunID="191206_NB501498_0174_AHWCNMBGXC"
 

diff --git a/indexes/TSO500_NextSeq_simple_indexes_legacy.tsv b/indexes/TSO500_NextSeq_simple_indexes_legacy.tsv
@@ -15,21 +15,3 @@ UP13	ACTGCTTA	D716	AGAGGCGC	D511
 UP14	ATGCGGCT	D714	TAGCCGCG	D512
 UP15	GCCTCTCT	D718	TTCGTAGG	D514
 UP16	GCCGTAGG	D719	CGCTCCGC	D516
-#
-#Test_Sample_UP01,,,,UP01,TCCGGAGA,D702,CCTATCCT,D503
-#Test_Sample_UP02,,,,UP02,CTGAAGCT,D707,GGCTCTGA,D504
-#Test_Sample_UP03,,,,UP03,CGTAGCTC,D717,TTCGGATG,D509
-#Test_Sample_UP04,,,,UP04,GAATTCGT,D706,ACTCATAA,D510
-#Test_Sample_UP05,,,,UP05,AGCGATAG,D712,TTATTCGT,D513
-#Test_Sample_UP06,,,,UP06,GCGATTAA,D724,AGCAGATC,D515
-#Test_Sample_UP07,,,,UP07,ATTCAGAA,D705,TATAGCCT,D501
-#Test_Sample_UP08,,,,UP08,GAATAATC,D713,ATAGAGGC,D502
-#Test_Sample_UP09,,,,UP09,TTAATCAG,D715,AGGCGAAG,D505
-#Test_Sample_UP10,,,,UP10,CGCTCATT,D703,TAATCTTA,D506
-#Test_Sample_UP11,,,,UP11,TCCGCGAA,D710,TACTTACT,D517
-#Test_Sample_UP12,,,,UP12,ATTACTCG,D701,AGGAAGTC,D518
-#Test_Sample_UP13,,,,UP13,ACTGCTTA,D716,GCGCCTCT,D511
-#Test_Sample_UP14,,,,UP14,ATGCGGCT,D714,CGCGGCTA,D512
-#Test_Sample_UP15,,,,UP15,GCCTCTCT,D718,CCTACGAA,D514
-#Test_Sample_UP16,,,,UP16,GCCGTAGG,D719,GCGGAGCG,D516
-#https://support.illumina.com/downloads/tso500-ctdna-sample-sheet-template.html
diff --git a/indexes/TSO500_NovaSeq_dual_indexes.tsv b/indexes/TSO500_NovaSeq_dual_indexes.tsv
@@ -1,4 +1,3 @@
-# grep -A 1000 "Sample_ID,Sample_Name,Sample_Plate" TSO500_NextSeq_SampleSheetTemplate_UDP.csv | awk 'BEGIN {FS=","; OFS="\t"} ; {print $5,$6,$7}' > TSO500_NextSeq_SampleSheetTemplate_indexes.tsv
 Index_ID	index	index2
 UDP0001	GAACTGAGCG	CGCTCCACGA
 UDP0002	AGGTCAGATA	TATCTTGTAG

diff --git a/samplesheet_generator.py b/samplesheet_generator.py
@@ -266,7 +266,6 @@ def print_data_section_dual_indexes_v1(info):
 
 
 
-
 def update_info_simple_indexes(info, sample_id, index, I7_Index_ID, index2, I5_Index_ID, molecule):
 # Store the sample info (index, I7_Index_ID, index2, I5_Index_ID, molecule) into the info dictionary,
 # if there is no other record present for info[sample_id].
@@ -416,20 +415,20 @@ def main():
 	# read the input parameters
 	parser=argparse.ArgumentParser(description='Generate SampleSheet for TSO500 LocalApp analysis for a given sequencing run, index_type and index_length.')
 	parser.add_argument('-r', '--run-id', help='ID string of the sequencing run for which a samplesheet should be generated.', required=True, type=str)
-	parser.add_argument('-t', '--index-type', help='Type of indexes, allowed values are \'dual\' and \'simple\'.', required=True, type=str)
-	parser.add_argument('-x', '--index-length', help='Index sequence length. Supported lengths 8 and 10 for dual indexes and 8 for simple indexes.', required=True, type=int)
+	parser.add_argument('-t', '--index-type', help='Type of indexes, allowed values are \'dual\' and \'simple\'.', required=True, type=str, choices=['dual', 'simple'])
+	parser.add_argument('-x', '--index-length', help='Index sequence length. Supported lengths 8 and 10 for dual indexes and 8 for simple indexes.', required=True, type=int, choices=[8, 10])
 	parser.add_argument('-n', '--investigator-name', help='Investigator name to be passed into samplesheet header, cannot contain a comma. [Default: \'\']', default='', type=str)
 	parser.add_argument('-e', '--experiment-name', help='Experiment name to be passed into samplesheet header, cannot contain a comma. [Default: \'\']', default='', type=str)
 	parser.add_argument('-i', '--input-info-file', help='Tab separated file with info about the samples that should be processed in one go.', required=True, type=str)
 	parser.add_argument('-1', '--read-length-1', help='Length of sequenced forward reads. [Default: \'101\']', default='101', type=str)
 	parser.add_argument('-2', '--read-length-2', help='Length of sequenced reverse reads. [Default: \'101\']', default='101', type=str)
 	parser.add_argument('-3', '--adapter-read-1', help='Sequence of read 1 adapter, will be used by BCL convert. [Default=\'\']', required=True, type=str)
 	parser.add_argument('-4', '--adapter-read-2', help='Sequence of read 2 adapter, will be used by BCL convert. [Default=\'\']', required=True, type=str)
-	parser.add_argument('-b', '--adapter-behavior', help='Setting AdapterBehavior value that will be used by BCL convert. [Default=\'trim\']', default='trim', type=str)
+	parser.add_argument('-b', '--adapter-behavior', help='Setting AdapterBehavior value that will be used by BCL convert. BCL convert supports values \'trim\' and \'mask\' [Default=\'trim\']', default='trim', type=str, choices=['trim', 'mask'])
 	parser.add_argument('-l', '--minimum-trimmed-read-length', help='Setting MinimumTrimmedReadLength value that will be used by BCL convert. [Default=35]', default=35, type=int)
 	parser.add_argument('-m', '--mask-short-reads', help='Setting MaskShortReads value that will be used by BCL convert. [Default=22]', default=22, type=int)
 	parser.add_argument('-o', '--override-cycles', help='Setting OverrideCycles value that will be used by BCL convert. [Default=\'\']', required=True, type=str)
-	parser.add_argument('-s', '--samplesheet-version', help='Specify sample sheet version. [Default=\'v1\']', default='v1', type=str)
+	parser.add_argument('-s', '--samplesheet-version', help='Specify sample sheet version. [Default=\'v1\']', default='v1', type=str, choices=['v1', 'v2'])
 
 	args=parser.parse_args()
 
@@ -455,16 +454,11 @@ def main():
 
 	if   (index_type.lower() == "dual"):
 		dual_indexes = True
-	elif (index_type.lower() == "simple"):
+	else:  # argparse choices=['dual', 'simple'] leave only "simple" here; `else` cannot take a condition
 		dual_indexes = False
-	else:
-		sys.exit("Unsupported value of index_type=\'"+args.index_type+"\' provided.")	
 
-
-	if not (index_length == 8) and not (index_length == 10):
-		sys.exit("Unsupported value of index_length=\'"+index_length+"\' provided.")
-	elif not dual_indexes and (index_length == 10):
-		sys.exit("Unsupported value of index_length=\'"+index_length+"\' for simple indexes provided.")
+	if not dual_indexes and (index_length == 10):  # simple indexes are supported in length 8 only
+		sys.exit("Unsupported value of index_length=\'"+str(index_length)+"\' for simple indexes.")  # index_length equals the int 10 here, so str() is needed to concatenate
 
 	indexes=assign_indexes(dual_indexes, index_length)
 
@@ -477,10 +471,8 @@ def main():
 		print_reads_section_v1(read_length_1, read_length_2)
 		print_settings_section_v1(adapter_read_1, adapter_read_2, adapter_behavior, minimum_trimmed_read_length, mask_short_reads, override_cycles)
 		print_data_section_v1(run_id, dual_indexes, index_length, indexes, input_info_file)
-	elif (samplesheet_version == 'v2'):
+	else:  # remaining allowed version is 'v2' (argparse choices: 'v1', 'v2'); `else` cannot take a condition
 		sys.exit("Not implemented yet")
-	else:
-		sys.exit("Samplesheet version "+samplesheet_version+" is not supported at the moment")
 
 
 if __name__ == "__main__":

diff --git a/test/samplesheet.tsv b/test/samplesheet.tsv
@@ -0,0 +1,20 @@
+[Header]
+Investigator Name,Name (InPreD node)
+Experiment Name,OUS pathology test run
+Date,07/02/2024
+
+[Reads]
+101
+101
+
+[Settings]
+AdapterRead1,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
+AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+AdapterBehavior,trim
+MinimumTrimmedReadLength,35
+MaskShortReads,22
+OverrideCycles,U7N1Y93;I8;I8;U7N1Y93
+
+[Data]
+Sample_ID,Sample_Type,Pair_ID,index,I7_Index_ID,index2,I5_Index_ID
+CLAcroMetrix-D01-X01-X00,DNA,CLAcroMetrix-D01-X01-X00,TCCGGAGA,D702,AGGATAGG,D503