From 712eb2a11825e8d36f2870deb12b35486bd633fb Mon Sep 17 00:00:00 2001
From: Kendell Clement <k.clement.dev@gmail.com>
Date: Thu, 4 May 2023 16:40:07 -0400
Subject: [PATCH] Allow dashes in filenames (resolves #73)

---
 CRISPResso2/CRISPRessoPooledCORE.py | 11 ++---------
 CRISPResso2/CRISPRessoShared.py     | 16 +++++++---------
 CRISPResso2/CRISPRessoWGSCORE.py    | 13 ++-----------
 3 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py
index 6a79abd4..4bcee22e 100644
--- a/CRISPResso2/CRISPRessoPooledCORE.py
+++ b/CRISPResso2/CRISPRessoPooledCORE.py
@@ -162,13 +162,6 @@ def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end):
     p = sb.Popen("samtools view -F 0x904 -c %s %s:%d-%d" %(bam_filename, chr_name, chr_start, chr_end), shell=True, stdout=sb.PIPE)
     return int(p.communicate()[0])
 
-#get a clean name that we can use for a filename
-validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)
-
-def clean_filename(filename):
-    cleanedFilename = unicodedata.normalize('NFKD', filename)
-    return ''.join(c for c in cleanedFilename if c in validFilenameChars)
-
 def find_overlapping_genes(row, df_genes):
     df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) &
                                      (df_genes.txStart<=row.bpend) &
@@ -752,10 +745,10 @@ def main():
             with open(amplicon_fa_filename, 'w+') as outfile:
                 for idx, row in df_template.iterrows():
                     if row['amplicon_seq']:
-                        outfile.write('>%s\n%s\n' %(clean_filename('AMPL_'+idx), row['amplicon_seq']))
+                        outfile.write('>%s\n%s\n' %(CRISPRessoShared.clean_filename('AMPL_'+idx), row['amplicon_seq']))
 
                         #create place-holder fastq files
-                        fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % clean_filename('AMPL_'+idx)))
+                        fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % CRISPRessoShared.clean_filename('AMPL_'+idx)))
                         open(fastq_gz_amplicon_filenames[-1], 'w+').close()
 
             df_template['Demultiplexed_fastq.gz_filename']=fastq_gz_amplicon_filenames
diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
index 3608695d..f17326b7 100644
--- a/CRISPResso2/CRISPRessoShared.py
+++ b/CRISPResso2/CRISPRessoShared.py
@@ -449,12 +449,13 @@ def capitalize_sequence(x):
     return str(x).upper() if not pd.isnull(x) else x
 
 
-def slugify(value):  # adapted from the Django project
-
+def slugify(value):
+    """Return `value` as ASCII text: whitespace and the characters '*"/\\[]:;|,<>? become '_', and runs of '_' collapse to one."""
     value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
-    value = re.sub(rb'[^\w\s-]', b'_', value).strip()
-    value = re.sub(rb'[-\s]+', b'-', value)
+    value = re.sub(rb'[\s\'*"/\\\[\]:;|,<>?]', b'_', value).strip()
+    value = re.sub(rb'_{2,}', b'_', value)
 
+    # decode back to str so callers get text rather than the intermediate bytes
     return value.decode('utf-8')
 
 
@@ -516,13 +517,10 @@ def get_ref_length_from_cigar(cigar_string):
 
 def clean_filename(filename):
     # get a clean name that we can use for a filename
-    # validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)
+    validFilenameChars = "+-_.()%s%s" % (string.ascii_letters, string.digits)  # allow-list: + - _ . ( ) plus ASCII letters and digits
     filename = str(filename).replace(' ', '_')
-    validFilenameChars = "_.%s%s" % (string.ascii_letters, string.digits)
-
     cleanedFilename = unicodedata.normalize('NFKD', filename)
-    return ''.join(c for c in cleanedFilename if c in validFilenameChars)
-
+    return(''.join(c for c in cleanedFilename if c in validFilenameChars))  # keep only allow-listed characters; all others are dropped
 
 def check_file(filename):
     try:
diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
index 6b8a1b1f..1ff2ec33 100644
--- a/CRISPResso2/CRISPRessoWGSCORE.py
+++ b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -109,15 +109,6 @@ def get_region_from_fa(chr_id, bpstart, bpend, uncompressed_reference):
     p = sb.Popen("samtools faidx %s %s |   grep -v ^\> | tr -d '\n'" %(uncompressed_reference, region), shell=True, stdout=sb.PIPE)
     return p.communicate()[0].decode('utf-8').upper()
 
-
-#get a clean name that we can use for a filename
-validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)
-
-def clean_filename(filename):
-    cleanedFilename = unicodedata.normalize('NFKD', filename)
-    return ''.join(c for c in cleanedFilename if c in validFilenameChars)
-
-
 def find_overlapping_genes(row, df_genes):
     df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) &
                                      (df_genes.txStart<=row.bpend) &
@@ -554,8 +545,8 @@ def rreplace(s, old, new):
 
         def set_filenames(row):
             row_fastq_exists = False
-            fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % clean_filename('REGION_'+str(row.region_number)))
-            bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % clean_filename('REGION_'+str(row.region_number)))
+            fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number)))
+            bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number)))
             #if bam file already exists, don't regenerate it
             if os.path.isfile(fastq_gz_filename):
                 row_fastq_exists = True