From 712eb2a11825e8d36f2870deb12b35486bd633fb Mon Sep 17 00:00:00 2001 From: Kendell Clement Date: Thu, 4 May 2023 16:40:07 -0400 Subject: [PATCH] Allow dashes in filenames resolve #73 --- CRISPResso2/CRISPRessoPooledCORE.py | 11 ++--------- CRISPResso2/CRISPRessoShared.py | 16 +++++++--------- CRISPResso2/CRISPRessoWGSCORE.py | 13 ++----------- 3 files changed, 11 insertions(+), 29 deletions(-) diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py index 6a79abd4..4bcee22e 100644 --- a/CRISPResso2/CRISPRessoPooledCORE.py +++ b/CRISPResso2/CRISPRessoPooledCORE.py @@ -162,13 +162,6 @@ def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end): p = sb.Popen("samtools view -F 0x904 -c %s %s:%d-%d" %(bam_filename, chr_name, chr_start, chr_end), shell=True, stdout=sb.PIPE) return int(p.communicate()[0]) -#get a clean name that we can use for a filename -validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits) - -def clean_filename(filename): - cleanedFilename = unicodedata.normalize('NFKD', filename) - return ''.join(c for c in cleanedFilename if c in validFilenameChars) - def find_overlapping_genes(row, df_genes): df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) & (df_genes.txStart<=row.bpend) & @@ -752,10 +745,10 @@ def main(): with open(amplicon_fa_filename, 'w+') as outfile: for idx, row in df_template.iterrows(): if row['amplicon_seq']: - outfile.write('>%s\n%s\n' %(clean_filename('AMPL_'+idx), row['amplicon_seq'])) + outfile.write('>%s\n%s\n' %(CRISPRessoShared.clean_filename('AMPL_'+idx), row['amplicon_seq'])) #create place-holder fastq files - fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % clean_filename('AMPL_'+idx))) + fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % CRISPRessoShared.clean_filename('AMPL_'+idx))) open(fastq_gz_amplicon_filenames[-1], 'w+').close() df_template['Demultiplexed_fastq.gz_filename']=fastq_gz_amplicon_filenames diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index 3608695d..f17326b7 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -449,12 +449,13 @@ def capitalize_sequence(x): return str(x).upper() if not pd.isnull(x) else x -def slugify(value): # adapted from the Django project - +def slugify(value): + print('slugify incoming strinG: ' + str(value)) value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') - value = re.sub(rb'[^\w\s-]', b'_', value).strip() - value = re.sub(rb'[-\s]+', b'-', value) + value = re.sub(rb'[\s\'*"/\\\[\]:;|,<>?]', b'_', value).strip() + value = re.sub(rb'_{2,}', b'_', value) + print('slugify outgoing string: ' + str(value.decode('utf-8'))) return value.decode('utf-8') @@ -516,13 +517,10 @@ def get_ref_length_from_cigar(cigar_string): def clean_filename(filename): # get a clean name that we can use for a filename - # validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits) + validFilenameChars = "+-_.()%s%s" % (string.ascii_letters, string.digits) filename = str(filename).replace(' ', '_') - validFilenameChars = "_.%s%s" % (string.ascii_letters, string.digits) - cleanedFilename = unicodedata.normalize('NFKD', filename) - return ''.join(c for c in cleanedFilename if c in validFilenameChars) - + return(''.join(c for c in cleanedFilename if c in validFilenameChars)) def check_file(filename): try: diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py index 6b8a1b1f..1ff2ec33 100644 --- a/CRISPResso2/CRISPRessoWGSCORE.py +++ b/CRISPResso2/CRISPRessoWGSCORE.py @@ -109,15 +109,6 @@ def get_region_from_fa(chr_id, bpstart, bpend, uncompressed_reference): p = sb.Popen("samtools faidx %s %s | grep -v ^\> | tr -d '\n'" %(uncompressed_reference, region), shell=True, stdout=sb.PIPE) return p.communicate()[0].decode('utf-8').upper() - -#get a clean name that we can use for a filename -validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits) - -def clean_filename(filename): - cleanedFilename = unicodedata.normalize('NFKD', filename) - return ''.join(c for c in cleanedFilename if c in validFilenameChars) - - def find_overlapping_genes(row, df_genes): df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) & (df_genes.txStart<=row.bpend) & @@ -554,8 +545,8 @@ def rreplace(s, old, new): def set_filenames(row): row_fastq_exists = False - fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % clean_filename('REGION_'+str(row.region_number))) - bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % clean_filename('REGION_'+str(row.region_number))) + fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number))) + bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number))) #if bam file already exists, don't regenerate it if os.path.isfile(fastq_gz_filename): row_fastq_exists = True