Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix to allow hashes in read IDs in cload pairs #193

Merged
merged 4 commits into from
Apr 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions cooler/cli/cload.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,44 @@
)


def get_header(instream, comment_char='#'):  # Copied from pairtools._headerops
    '''Return the header of a stream and the remainder of the stream
    with the actual data.

    Header lines are the leading lines that start with ``comment_char``.
    They are consumed from the underlying binary buffer, so the returned
    stream is positioned at the first non-header line.

    Parameters
    ----------
    instream : a file object
        An input stream. Either a text stream exposing its binary layer
        as ``.buffer`` (e.g. ``sys.stdin`` or an ``io.TextIOWrapper``), or
        a binary stream that itself provides ``peek()`` and ``readline()``.
    comment_char : str
        The character prepended to header lines (use '@' when parsing sams,
        '#' when parsing pairsams).

    Returns
    -------
    header : list
        The header lines, stripped of terminal spaces and newline characters.
    remainder_stream : stream/file-like object
        The same ``instream`` object, advanced past the header lines.

    Raises
    ------
    ValueError
        If ``comment_char`` is empty.

    '''
    if not comment_char:
        raise ValueError('Please, provide a comment char!')
    comment_byte = comment_char.encode()

    # Get a peekable binary buffer for the instream. Text wrappers expose it
    # as ``.buffer``; an already-binary peekable stream is used directly.
    inbuffer = getattr(instream, 'buffer', instream)

    # Always pass an explicit size to peek(): io.BufferedReader.peek() has a
    # default, but gzip.GzipFile.peek(n) requires the argument, and a
    # gzip-compressed input is wrapped around a GzipFile.
    peek_size = len(comment_byte)

    header = []
    current_peek = inbuffer.peek(peek_size)
    while current_peek.startswith(comment_byte):
        # Consuming a whole line from the buffer guarantees that the
        # remainder of the buffer starts at the beginning of a line.
        line = inbuffer.readline()
        # The line starts with the comment char, so it belongs to the header.
        header.append(line.decode().strip())
        # Peek into the remainder of the instream.
        current_peek = inbuffer.peek(peek_size)

    # The next line does not start with the comment char (or the stream is
    # exhausted): return the header and the instream, advanced to the
    # beginning of the data.
    return header, instream

@cli.group()
def cload():
"""
Expand Down Expand Up @@ -485,15 +523,16 @@ def pairs(bins, pairs_path, cool_path, metadata, assembly, chunksize,
if pairs_path == '-':
f_in = sys.stdin
else:
f_in = pairs_path
f_in = pd.io.common.get_handle(pairs_path, mode='r',
compression='infer')[0]
f_in = get_header(f_in)[1] # We could save the header into metadata?

reader = pd.read_csv(
f_in,
sep='\t',
usecols=[input_field_numbers[name] for name in input_field_names],
names=input_field_names,
dtype=input_field_dtypes,
comment=comment_char,
iterator=True,
chunksize=chunksize)

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
six
numpy>=1.9
scipy>=0.16
pandas>=0.19
pandas>=1.0
h5py>=2.5
click>=7
cytoolz
Expand All @@ -11,4 +11,4 @@ pyfaidx
pypairix
asciitree
pyyaml
simplejson
simplejson
192 changes: 96 additions & 96 deletions tests/data/toy.pairs
Original file line number Diff line number Diff line change
@@ -1,96 +1,96 @@
chr1 1 chr1 17 - + 0.1
chr1 2 chr1 18 - + 0.1
chr1 3 chr1 19 - + 0.1
chr1 4 chr1 20 - + 0.1
chr1 5 chr1 21 - + 0.1
chr1 6 chr1 22 - + 0.1
chr1 7 chr1 23 - + 0.1
chr1 8 chr1 24 - + 0.1
chr1 9 chr1 25 - + 0.1
chr1 10 chr1 26 - + 0.1
chr1 11 chr1 27 - + 0.1
chr1 12 chr1 28 - + 0.1
chr1 13 chr1 29 - + 0.1
chr1 14 chr1 30 - + 0.1
chr1 15 chr1 31 - + 0.1
chr1 16 chr1 32 - + 0.1
chr2 17 chr2 1 - + 0.1
chr2 18 chr2 2 - + 0.1
chr2 19 chr2 3 - + 0.1
chr2 20 chr2 4 - + 0.1
chr2 21 chr2 5 - + 0.1
chr2 22 chr2 6 - + 0.1
chr2 23 chr2 7 - + 0.1
chr2 24 chr2 8 - + 0.1
chr2 25 chr2 9 - + 0.1
chr2 26 chr2 10 - + 0.1
chr2 27 chr2 11 - + 0.1
chr2 28 chr2 12 - + 0.1
chr2 29 chr2 13 - + 0.1
chr2 30 chr2 14 - + 0.1
chr2 31 chr2 15 - + 0.1
chr2 32 chr2 16 - + 0.1
chr2 1 chr1 32 - + 0.1
chr2 2 chr1 31 - + 0.1
chr2 3 chr1 30 - + 0.1
chr2 4 chr1 29 - + 0.1
chr2 5 chr1 28 - + 0.1
chr2 6 chr1 27 - + 0.1
chr2 7 chr1 26 - + 0.1
chr2 8 chr1 25 - + 0.1
chr2 9 chr1 24 - + 0.1
chr2 10 chr1 23 - + 0.1
chr2 11 chr1 22 - + 0.1
chr2 12 chr1 21 - + 0.1
chr2 13 chr1 20 - + 0.1
chr2 14 chr1 19 - + 0.1
chr2 15 chr1 18 - + 0.1
chr2 16 chr1 17 - + 0.1
chr2 17 chr1 16 - + 0.1
chr2 18 chr1 15 - + 0.1
chr2 19 chr1 14 - + 0.1
chr2 20 chr1 13 - + 0.1
chr2 21 chr1 12 - + 0.1
chr2 22 chr1 11 - + 0.1
chr2 23 chr1 10 - + 0.1
chr2 24 chr1 9 - + 0.1
chr2 25 chr1 8 - + 0.1
chr2 26 chr1 7 - + 0.1
chr2 27 chr1 6 - + 0.1
chr2 28 chr1 5 - + 0.1
chr2 29 chr1 4 - + 0.1
chr2 30 chr1 3 - + 0.1
chr2 31 chr1 2 - + 0.1
chr2 32 chr1 1 - + 0.1
chr1 1 chr2 1 - + 0.1
chr1 2 chr2 2 - + 0.1
chr1 3 chr2 3 - + 0.1
chr1 4 chr2 4 - + 0.1
chr1 5 chr2 5 - + 0.1
chr1 6 chr2 6 - + 0.1
chr1 7 chr2 7 - + 0.1
chr1 8 chr2 8 - + 0.1
chr1 9 chr2 9 - + 0.1
chr1 10 chr2 10 - + 0.1
chr1 11 chr2 11 - + 0.1
chr1 12 chr2 12 - + 0.1
chr1 13 chr2 13 - + 0.1
chr1 14 chr2 14 - + 0.1
chr1 15 chr2 15 - + 0.1
chr1 16 chr2 16 - + 0.1
chr1 17 chr2 17 - + 0.1
chr1 18 chr2 18 - + 0.1
chr1 19 chr2 19 - + 0.1
chr1 20 chr2 20 - + 0.1
chr1 21 chr2 21 - + 0.1
chr1 22 chr2 22 - + 0.1
chr1 23 chr2 23 - + 0.1
chr1 24 chr2 24 - + 0.1
chr1 25 chr2 25 - + 0.1
chr1 26 chr2 26 - + 0.1
chr1 27 chr2 27 - + 0.1
chr1 28 chr2 28 - + 0.1
chr1 29 chr2 29 - + 0.1
chr1 30 chr2 30 - + 0.1
chr1 31 chr2 31 - + 0.1
chr1 32 chr2 32 - + 0.1
Read1 chr1 1 chr1 17 - + 0.1
Read2 chr1 2 chr1 18 - + 0.1
Read3 chr1 3 chr1 19 - + 0.1
Read4 chr1 4 chr1 20 - + 0.1
Read5 chr1 5 chr1 21 - + 0.1
Read6 chr1 6 chr1 22 - + 0.1
Read7 chr1 7 chr1 23 - + 0.1
Read8 chr1 8 chr1 24 - + 0.1
Read9 chr1 9 chr1 25 - + 0.1
Read10 chr1 10 chr1 26 - + 0.1
Read11 chr1 11 chr1 27 - + 0.1
Read12 chr1 12 chr1 28 - + 0.1
Read13 chr1 13 chr1 29 - + 0.1
Read14 chr1 14 chr1 30 - + 0.1
Read15 chr1 15 chr1 31 - + 0.1
Read16 chr1 16 chr1 32 - + 0.1
Read17 chr2 17 chr2 1 - + 0.1
Read18 chr2 18 chr2 2 - + 0.1
Read19 chr2 19 chr2 3 - + 0.1
Read20 chr2 20 chr2 4 - + 0.1
Read21 chr2 21 chr2 5 - + 0.1
Read22 chr2 22 chr2 6 - + 0.1
Read23 chr2 23 chr2 7 - + 0.1
Read24 chr2 24 chr2 8 - + 0.1
Read25 chr2 25 chr2 9 - + 0.1
Read26 chr2 26 chr2 10 - + 0.1
Read27 chr2 27 chr2 11 - + 0.1
Read28 chr2 28 chr2 12 - + 0.1
Read29 chr2 29 chr2 13 - + 0.1
Read30 chr2 30 chr2 14 - + 0.1
Read31 chr2 31 chr2 15 - + 0.1
Read32 chr2 32 chr2 16 - + 0.1
Read33 chr2 1 chr1 32 - + 0.1
Read34 chr2 2 chr1 31 - + 0.1
Read35 chr2 3 chr1 30 - + 0.1
Read36 chr2 4 chr1 29 - + 0.1
Read37 chr2 5 chr1 28 - + 0.1
Read38 chr2 6 chr1 27 - + 0.1
Read39 chr2 7 chr1 26 - + 0.1
Read40 chr2 8 chr1 25 - + 0.1
Read41 chr2 9 chr1 24 - + 0.1
Read42 chr2 10 chr1 23 - + 0.1
Read43 chr2 11 chr1 22 - + 0.1
Read44 chr2 12 chr1 21 - + 0.1
Read45 chr2 13 chr1 20 - + 0.1
Read46 chr2 14 chr1 19 - + 0.1
Read47 chr2 15 chr1 18 - + 0.1
Read48 chr2 16 chr1 17 - + 0.1
Read49 chr2 17 chr1 16 - + 0.1
Read50 chr2 18 chr1 15 - + 0.1
Read51 chr2 19 chr1 14 - + 0.1
Read52 chr2 20 chr1 13 - + 0.1
Read53 chr2 21 chr1 12 - + 0.1
Read54 chr2 22 chr1 11 - + 0.1
Read55 chr2 23 chr1 10 - + 0.1
Read56 chr2 24 chr1 9 - + 0.1
Read57 chr2 25 chr1 8 - + 0.1
Read58 chr2 26 chr1 7 - + 0.1
Read59 chr2 27 chr1 6 - + 0.1
Read60 chr2 28 chr1 5 - + 0.1
Read61 chr2 29 chr1 4 - + 0.1
Read62 chr2 30 chr1 3 - + 0.1
Read63 chr2 31 chr1 2 - + 0.1
Read64 chr2 32 chr1 1 - + 0.1
Read65 chr1 1 chr2 1 - + 0.1
Read66 chr1 2 chr2 2 - + 0.1
Read67 chr1 3 chr2 3 - + 0.1
Read68 chr1 4 chr2 4 - + 0.1
Read69 chr1 5 chr2 5 - + 0.1
Read70 chr1 6 chr2 6 - + 0.1
Read71 chr1 7 chr2 7 - + 0.1
Read72 chr1 8 chr2 8 - + 0.1
Read73 chr1 9 chr2 9 - + 0.1
Read74 chr1 10 chr2 10 - + 0.1
Read75 chr1 11 chr2 11 - + 0.1
Read76 chr1 12 chr2 12 - + 0.1
Read77 chr1 13 chr2 13 - + 0.1
Read78 chr1 14 chr2 14 - + 0.1
Read79 chr1 15 chr2 15 - + 0.1
Read80 chr1 16 chr2 16 - + 0.1
Read81 chr1 17 chr2 17 - + 0.1
Read82 chr1 18 chr2 18 - + 0.1
Read83 chr1 19 chr2 19 - + 0.1
Read84 chr1 20 chr2 20 - + 0.1
Read85 chr1 21 chr2 21 - + 0.1
Read86 chr1 22 chr2 22 - + 0.1
Read87 chr1 23 chr2 23 - + 0.1
Read88 chr1 24 chr2 24 - + 0.1
Read89 chr1 25 chr2 25 - + 0.1
Read90 chr1 26 chr2 26 - + 0.1
Read91 chr1 27 chr2 27 - + 0.1
Read92 chr1 28 chr2 28 - + 0.1
Read93 chr1 29 chr2 29 - + 0.1
Read94 chr1 30 chr2 30 - + 0.1
Read95 chr1 31 chr2 31 - + 0.1
Read96 chr1 32 chr2 32 - + 0.1
96 changes: 96 additions & 0 deletions tests/data/toy_hash.pairs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
Read#1 chr1 1 chr1 17 - + 0.1
Read#2 chr1 2 chr1 18 - + 0.1
Read#3 chr1 3 chr1 19 - + 0.1
Read#4 chr1 4 chr1 20 - + 0.1
Read#5 chr1 5 chr1 21 - + 0.1
Read#6 chr1 6 chr1 22 - + 0.1
Read#7 chr1 7 chr1 23 - + 0.1
Read#8 chr1 8 chr1 24 - + 0.1
Read#9 chr1 9 chr1 25 - + 0.1
Read#10 chr1 10 chr1 26 - + 0.1
Read#11 chr1 11 chr1 27 - + 0.1
Read#12 chr1 12 chr1 28 - + 0.1
Read#13 chr1 13 chr1 29 - + 0.1
Read#14 chr1 14 chr1 30 - + 0.1
Read#15 chr1 15 chr1 31 - + 0.1
Read#16 chr1 16 chr1 32 - + 0.1
Read#17 chr2 17 chr2 1 - + 0.1
Read#18 chr2 18 chr2 2 - + 0.1
Read#19 chr2 19 chr2 3 - + 0.1
Read#20 chr2 20 chr2 4 - + 0.1
Read#21 chr2 21 chr2 5 - + 0.1
Read#22 chr2 22 chr2 6 - + 0.1
Read#23 chr2 23 chr2 7 - + 0.1
Read#24 chr2 24 chr2 8 - + 0.1
Read#25 chr2 25 chr2 9 - + 0.1
Read#26 chr2 26 chr2 10 - + 0.1
Read#27 chr2 27 chr2 11 - + 0.1
Read#28 chr2 28 chr2 12 - + 0.1
Read#29 chr2 29 chr2 13 - + 0.1
Read#30 chr2 30 chr2 14 - + 0.1
Read#31 chr2 31 chr2 15 - + 0.1
Read#32 chr2 32 chr2 16 - + 0.1
Read#33 chr2 1 chr1 32 - + 0.1
Read#34 chr2 2 chr1 31 - + 0.1
Read#35 chr2 3 chr1 30 - + 0.1
Read#36 chr2 4 chr1 29 - + 0.1
Read#37 chr2 5 chr1 28 - + 0.1
Read#38 chr2 6 chr1 27 - + 0.1
Read#39 chr2 7 chr1 26 - + 0.1
Read#40 chr2 8 chr1 25 - + 0.1
Read#41 chr2 9 chr1 24 - + 0.1
Read#42 chr2 10 chr1 23 - + 0.1
Read#43 chr2 11 chr1 22 - + 0.1
Read#44 chr2 12 chr1 21 - + 0.1
Read#45 chr2 13 chr1 20 - + 0.1
Read#46 chr2 14 chr1 19 - + 0.1
Read#47 chr2 15 chr1 18 - + 0.1
Read#48 chr2 16 chr1 17 - + 0.1
Read#49 chr2 17 chr1 16 - + 0.1
Read#50 chr2 18 chr1 15 - + 0.1
Read#51 chr2 19 chr1 14 - + 0.1
Read#52 chr2 20 chr1 13 - + 0.1
Read#53 chr2 21 chr1 12 - + 0.1
Read#54 chr2 22 chr1 11 - + 0.1
Read#55 chr2 23 chr1 10 - + 0.1
Read#56 chr2 24 chr1 9 - + 0.1
Read#57 chr2 25 chr1 8 - + 0.1
Read#58 chr2 26 chr1 7 - + 0.1
Read#59 chr2 27 chr1 6 - + 0.1
Read#60 chr2 28 chr1 5 - + 0.1
Read#61 chr2 29 chr1 4 - + 0.1
Read#62 chr2 30 chr1 3 - + 0.1
Read#63 chr2 31 chr1 2 - + 0.1
Read#64 chr2 32 chr1 1 - + 0.1
Read#65 chr1 1 chr2 1 - + 0.1
Read#66 chr1 2 chr2 2 - + 0.1
Read#67 chr1 3 chr2 3 - + 0.1
Read#68 chr1 4 chr2 4 - + 0.1
Read#69 chr1 5 chr2 5 - + 0.1
Read#70 chr1 6 chr2 6 - + 0.1
Read#71 chr1 7 chr2 7 - + 0.1
Read#72 chr1 8 chr2 8 - + 0.1
Read#73 chr1 9 chr2 9 - + 0.1
Read#74 chr1 10 chr2 10 - + 0.1
Read#75 chr1 11 chr2 11 - + 0.1
Read#76 chr1 12 chr2 12 - + 0.1
Read#77 chr1 13 chr2 13 - + 0.1
Read#78 chr1 14 chr2 14 - + 0.1
Read#79 chr1 15 chr2 15 - + 0.1
Read#80 chr1 16 chr2 16 - + 0.1
Read#81 chr1 17 chr2 17 - + 0.1
Read#82 chr1 18 chr2 18 - + 0.1
Read#83 chr1 19 chr2 19 - + 0.1
Read#84 chr1 20 chr2 20 - + 0.1
Read#85 chr1 21 chr2 21 - + 0.1
Read#86 chr1 22 chr2 22 - + 0.1
Read#87 chr1 23 chr2 23 - + 0.1
Read#88 chr1 24 chr2 24 - + 0.1
Read#89 chr1 25 chr2 25 - + 0.1
Read#90 chr1 26 chr2 26 - + 0.1
Read#91 chr1 27 chr2 27 - + 0.1
Read#92 chr1 28 chr2 28 - + 0.1
Read#93 chr1 29 chr2 29 - + 0.1
Read#94 chr1 30 chr2 30 - + 0.1
Read#95 chr1 31 chr2 31 - + 0.1
Read#96 chr1 32 chr2 32 - + 0.1
Loading