Skip to content

Commit

Permalink
Allow dashes in filenames (resolves #73)
Browse files (browse the repository at this point in the history)
  • Loading branch information
kclem committed May 4, 2023
1 parent a439f09 commit 712eb2a
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 29 deletions.
11 changes: 2 additions & 9 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,6 @@ def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end):
p = sb.Popen("samtools view -F 0x904 -c %s %s:%d-%d" %(bam_filename, chr_name, chr_start, chr_end), shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

#get a clean name that we can use for a filename
validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)

def clean_filename(filename):
cleanedFilename = unicodedata.normalize('NFKD', filename)
return ''.join(c for c in cleanedFilename if c in validFilenameChars)

def find_overlapping_genes(row, df_genes):
df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) &
(df_genes.txStart<=row.bpend) &
Expand Down Expand Up @@ -752,10 +745,10 @@ def main():
with open(amplicon_fa_filename, 'w+') as outfile:
for idx, row in df_template.iterrows():
if row['amplicon_seq']:
outfile.write('>%s\n%s\n' %(clean_filename('AMPL_'+idx), row['amplicon_seq']))
outfile.write('>%s\n%s\n' %(CRISPRessoShared.clean_filename('AMPL_'+idx), row['amplicon_seq']))

#create place-holder fastq files
fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % clean_filename('AMPL_'+idx)))
fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % CRISPRessoShared.clean_filename('AMPL_'+idx)))
open(fastq_gz_amplicon_filenames[-1], 'w+').close()

df_template['Demultiplexed_fastq.gz_filename']=fastq_gz_amplicon_filenames
Expand Down
16 changes: 7 additions & 9 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,12 +449,13 @@ def capitalize_sequence(x):
return str(x).upper() if not pd.isnull(x) else x


def slugify(value): # adapted from the Django project

def slugify(value):
print('slugify incoming strinG: ' + str(value))
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
value = re.sub(rb'[^\w\s-]', b'_', value).strip()
value = re.sub(rb'[-\s]+', b'-', value)
value = re.sub(rb'[\s\'*"/\\\[\]:;|,<>?]', b'_', value).strip()
value = re.sub(rb'_{2,}', b'_', value)

print('slugify outgoing string: ' + str(value.decode('utf-8')))
return value.decode('utf-8')


Expand Down Expand Up @@ -516,13 +517,10 @@ def get_ref_length_from_cigar(cigar_string):

def clean_filename(filename):
# get a clean name that we can use for a filename
# validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)
validFilenameChars = "+-_.()%s%s" % (string.ascii_letters, string.digits)
filename = str(filename).replace(' ', '_')
validFilenameChars = "_.%s%s" % (string.ascii_letters, string.digits)

cleanedFilename = unicodedata.normalize('NFKD', filename)
return ''.join(c for c in cleanedFilename if c in validFilenameChars)

return(''.join(c for c in cleanedFilename if c in validFilenameChars))

def check_file(filename):
try:
Expand Down
13 changes: 2 additions & 11 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,6 @@ def get_region_from_fa(chr_id, bpstart, bpend, uncompressed_reference):
p = sb.Popen("samtools faidx %s %s | grep -v ^\> | tr -d '\n'" %(uncompressed_reference, region), shell=True, stdout=sb.PIPE)
return p.communicate()[0].decode('utf-8').upper()


#get a clean name that we can use for a filename
validFilenameChars = "+-_.() %s%s" % (string.ascii_letters, string.digits)

def clean_filename(filename):
cleanedFilename = unicodedata.normalize('NFKD', filename)
return ''.join(c for c in cleanedFilename if c in validFilenameChars)


def find_overlapping_genes(row, df_genes):
df_genes_overlapping=df_genes.loc[(df_genes.chrom==row.chr_id) &
(df_genes.txStart<=row.bpend) &
Expand Down Expand Up @@ -554,8 +545,8 @@ def rreplace(s, old, new):

def set_filenames(row):
row_fastq_exists = False
fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % clean_filename('REGION_'+str(row.region_number)))
bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % clean_filename('REGION_'+str(row.region_number)))
fastq_gz_filename=os.path.join(ANALYZED_REGIONS, '%s.fastq.gz' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number)))
bam_region_filename=os.path.join(ANALYZED_REGIONS, '%s.bam' % CRISPRessoShared.clean_filename('REGION_'+str(row.region_number)))
#if bam file already exists, don't regenerate it
if os.path.isfile(fastq_gz_filename):
row_fastq_exists = True
Expand Down

0 comments on commit 712eb2a

Please sign in to comment.