Skip to content

Commit

Permalink
Fix to allow hashes in read IDs in cload pairs (#193)
Browse files Browse the repository at this point in the history
* Fix to allow hashes in read IDs in cload pairs

* Update tests for cload pairs

* Fix tests

* Copy get_header and remove pairtools dependency
  • Loading branch information
Phlya authored Apr 24, 2020
1 parent 8c515d0 commit e9e3592
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 121 deletions.
43 changes: 41 additions & 2 deletions cooler/cli/cload.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,44 @@
)


def get_header(instream, comment_char='#'):  # Copied from pairtools._headerops
    """Return the header of a stream and the remainder of the stream
    holding the actual data.

    Lines are consumed from the start of the stream for as long as the
    next line begins with ``comment_char``. Only leading lines are treated
    as header, so a comment character occurring later in the data (for
    example inside a read ID) is left untouched.

    Parameters
    ----------
    instream : a file object
        An input stream. Either a text stream that exposes its underlying
        binary stream as ``.buffer``, or a binary stream itself. The binary
        stream must provide ``peek`` and ``readline``.
    comment_char : str
        The character prepended to header lines (use '@' when parsing sams,
        '#' when parsing pairsams).

    Returns
    -------
    header : list
        The header lines, stripped of surrounding whitespace and newline
        characters.
    remainder_stream : stream/file-like object
        The same ``instream`` object, advanced to the first non-header line.

    Raises
    ------
    ValueError
        If ``comment_char`` is empty.
    """
    if not comment_char:
        raise ValueError('Please, provide a comment char!')
    comment_byte = comment_char.encode()

    # Get a peekable binary buffer for the instream. Text wrappers expose it
    # as ``.buffer``; an already-binary stream is used directly.
    inbuffer = getattr(instream, 'buffer', instream)

    # Always pass an explicit size to peek(): gzip.GzipFile.peek requires it,
    # whereas io.BufferedReader.peek merely accepts it. peek() may return
    # more or fewer bytes than requested; startswith() copes with extra bytes.
    peek_size = len(comment_byte)

    header = []
    current_peek = inbuffer.peek(peek_size)
    while current_peek.startswith(comment_byte):
        # Consuming a whole line from the buffer guarantees that the
        # remainder of the buffer starts at the beginning of a line.
        line = inbuffer.readline()
        if isinstance(line, bytes):
            line = line.decode()
        header.append(line.strip())
        # Peek into the remainder of the stream.
        current_peek = inbuffer.peek(peek_size)

    # The next line does not start with the comment character: return the
    # header and the stream, now positioned at the beginning of the data.
    return header, instream

@cli.group()
def cload():
"""
Expand Down Expand Up @@ -485,15 +523,16 @@ def pairs(bins, pairs_path, cool_path, metadata, assembly, chunksize,
if pairs_path == '-':
f_in = sys.stdin
else:
f_in = pairs_path
f_in = pd.io.common.get_handle(pairs_path, mode='r',
compression='infer')[0]
f_in = get_header(f_in)[1] # We could save the header into metadata?

reader = pd.read_csv(
f_in,
sep='\t',
usecols=[input_field_numbers[name] for name in input_field_names],
names=input_field_names,
dtype=input_field_dtypes,
comment=comment_char,
iterator=True,
chunksize=chunksize)

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
six
numpy>=1.9
scipy>=0.16
pandas>=0.19
pandas>=1.0
h5py>=2.5
click>=7
cytoolz
Expand All @@ -11,4 +11,4 @@ pyfaidx
pypairix
asciitree
pyyaml
simplejson
simplejson
192 changes: 96 additions & 96 deletions tests/data/toy.pairs
Original file line number Diff line number Diff line change
@@ -1,96 +1,96 @@
chr1 1 chr1 17 - + 0.1
chr1 2 chr1 18 - + 0.1
chr1 3 chr1 19 - + 0.1
chr1 4 chr1 20 - + 0.1
chr1 5 chr1 21 - + 0.1
chr1 6 chr1 22 - + 0.1
chr1 7 chr1 23 - + 0.1
chr1 8 chr1 24 - + 0.1
chr1 9 chr1 25 - + 0.1
chr1 10 chr1 26 - + 0.1
chr1 11 chr1 27 - + 0.1
chr1 12 chr1 28 - + 0.1
chr1 13 chr1 29 - + 0.1
chr1 14 chr1 30 - + 0.1
chr1 15 chr1 31 - + 0.1
chr1 16 chr1 32 - + 0.1
chr2 17 chr2 1 - + 0.1
chr2 18 chr2 2 - + 0.1
chr2 19 chr2 3 - + 0.1
chr2 20 chr2 4 - + 0.1
chr2 21 chr2 5 - + 0.1
chr2 22 chr2 6 - + 0.1
chr2 23 chr2 7 - + 0.1
chr2 24 chr2 8 - + 0.1
chr2 25 chr2 9 - + 0.1
chr2 26 chr2 10 - + 0.1
chr2 27 chr2 11 - + 0.1
chr2 28 chr2 12 - + 0.1
chr2 29 chr2 13 - + 0.1
chr2 30 chr2 14 - + 0.1
chr2 31 chr2 15 - + 0.1
chr2 32 chr2 16 - + 0.1
chr2 1 chr1 32 - + 0.1
chr2 2 chr1 31 - + 0.1
chr2 3 chr1 30 - + 0.1
chr2 4 chr1 29 - + 0.1
chr2 5 chr1 28 - + 0.1
chr2 6 chr1 27 - + 0.1
chr2 7 chr1 26 - + 0.1
chr2 8 chr1 25 - + 0.1
chr2 9 chr1 24 - + 0.1
chr2 10 chr1 23 - + 0.1
chr2 11 chr1 22 - + 0.1
chr2 12 chr1 21 - + 0.1
chr2 13 chr1 20 - + 0.1
chr2 14 chr1 19 - + 0.1
chr2 15 chr1 18 - + 0.1
chr2 16 chr1 17 - + 0.1
chr2 17 chr1 16 - + 0.1
chr2 18 chr1 15 - + 0.1
chr2 19 chr1 14 - + 0.1
chr2 20 chr1 13 - + 0.1
chr2 21 chr1 12 - + 0.1
chr2 22 chr1 11 - + 0.1
chr2 23 chr1 10 - + 0.1
chr2 24 chr1 9 - + 0.1
chr2 25 chr1 8 - + 0.1
chr2 26 chr1 7 - + 0.1
chr2 27 chr1 6 - + 0.1
chr2 28 chr1 5 - + 0.1
chr2 29 chr1 4 - + 0.1
chr2 30 chr1 3 - + 0.1
chr2 31 chr1 2 - + 0.1
chr2 32 chr1 1 - + 0.1
chr1 1 chr2 1 - + 0.1
chr1 2 chr2 2 - + 0.1
chr1 3 chr2 3 - + 0.1
chr1 4 chr2 4 - + 0.1
chr1 5 chr2 5 - + 0.1
chr1 6 chr2 6 - + 0.1
chr1 7 chr2 7 - + 0.1
chr1 8 chr2 8 - + 0.1
chr1 9 chr2 9 - + 0.1
chr1 10 chr2 10 - + 0.1
chr1 11 chr2 11 - + 0.1
chr1 12 chr2 12 - + 0.1
chr1 13 chr2 13 - + 0.1
chr1 14 chr2 14 - + 0.1
chr1 15 chr2 15 - + 0.1
chr1 16 chr2 16 - + 0.1
chr1 17 chr2 17 - + 0.1
chr1 18 chr2 18 - + 0.1
chr1 19 chr2 19 - + 0.1
chr1 20 chr2 20 - + 0.1
chr1 21 chr2 21 - + 0.1
chr1 22 chr2 22 - + 0.1
chr1 23 chr2 23 - + 0.1
chr1 24 chr2 24 - + 0.1
chr1 25 chr2 25 - + 0.1
chr1 26 chr2 26 - + 0.1
chr1 27 chr2 27 - + 0.1
chr1 28 chr2 28 - + 0.1
chr1 29 chr2 29 - + 0.1
chr1 30 chr2 30 - + 0.1
chr1 31 chr2 31 - + 0.1
chr1 32 chr2 32 - + 0.1
Read1 chr1 1 chr1 17 - + 0.1
Read2 chr1 2 chr1 18 - + 0.1
Read3 chr1 3 chr1 19 - + 0.1
Read4 chr1 4 chr1 20 - + 0.1
Read5 chr1 5 chr1 21 - + 0.1
Read6 chr1 6 chr1 22 - + 0.1
Read7 chr1 7 chr1 23 - + 0.1
Read8 chr1 8 chr1 24 - + 0.1
Read9 chr1 9 chr1 25 - + 0.1
Read10 chr1 10 chr1 26 - + 0.1
Read11 chr1 11 chr1 27 - + 0.1
Read12 chr1 12 chr1 28 - + 0.1
Read13 chr1 13 chr1 29 - + 0.1
Read14 chr1 14 chr1 30 - + 0.1
Read15 chr1 15 chr1 31 - + 0.1
Read16 chr1 16 chr1 32 - + 0.1
Read17 chr2 17 chr2 1 - + 0.1
Read18 chr2 18 chr2 2 - + 0.1
Read19 chr2 19 chr2 3 - + 0.1
Read20 chr2 20 chr2 4 - + 0.1
Read21 chr2 21 chr2 5 - + 0.1
Read22 chr2 22 chr2 6 - + 0.1
Read23 chr2 23 chr2 7 - + 0.1
Read24 chr2 24 chr2 8 - + 0.1
Read25 chr2 25 chr2 9 - + 0.1
Read26 chr2 26 chr2 10 - + 0.1
Read27 chr2 27 chr2 11 - + 0.1
Read28 chr2 28 chr2 12 - + 0.1
Read29 chr2 29 chr2 13 - + 0.1
Read30 chr2 30 chr2 14 - + 0.1
Read31 chr2 31 chr2 15 - + 0.1
Read32 chr2 32 chr2 16 - + 0.1
Read33 chr2 1 chr1 32 - + 0.1
Read34 chr2 2 chr1 31 - + 0.1
Read35 chr2 3 chr1 30 - + 0.1
Read36 chr2 4 chr1 29 - + 0.1
Read37 chr2 5 chr1 28 - + 0.1
Read38 chr2 6 chr1 27 - + 0.1
Read39 chr2 7 chr1 26 - + 0.1
Read40 chr2 8 chr1 25 - + 0.1
Read41 chr2 9 chr1 24 - + 0.1
Read42 chr2 10 chr1 23 - + 0.1
Read43 chr2 11 chr1 22 - + 0.1
Read44 chr2 12 chr1 21 - + 0.1
Read45 chr2 13 chr1 20 - + 0.1
Read46 chr2 14 chr1 19 - + 0.1
Read47 chr2 15 chr1 18 - + 0.1
Read48 chr2 16 chr1 17 - + 0.1
Read49 chr2 17 chr1 16 - + 0.1
Read50 chr2 18 chr1 15 - + 0.1
Read51 chr2 19 chr1 14 - + 0.1
Read52 chr2 20 chr1 13 - + 0.1
Read53 chr2 21 chr1 12 - + 0.1
Read54 chr2 22 chr1 11 - + 0.1
Read55 chr2 23 chr1 10 - + 0.1
Read56 chr2 24 chr1 9 - + 0.1
Read57 chr2 25 chr1 8 - + 0.1
Read58 chr2 26 chr1 7 - + 0.1
Read59 chr2 27 chr1 6 - + 0.1
Read60 chr2 28 chr1 5 - + 0.1
Read61 chr2 29 chr1 4 - + 0.1
Read62 chr2 30 chr1 3 - + 0.1
Read63 chr2 31 chr1 2 - + 0.1
Read64 chr2 32 chr1 1 - + 0.1
Read65 chr1 1 chr2 1 - + 0.1
Read66 chr1 2 chr2 2 - + 0.1
Read67 chr1 3 chr2 3 - + 0.1
Read68 chr1 4 chr2 4 - + 0.1
Read69 chr1 5 chr2 5 - + 0.1
Read70 chr1 6 chr2 6 - + 0.1
Read71 chr1 7 chr2 7 - + 0.1
Read72 chr1 8 chr2 8 - + 0.1
Read73 chr1 9 chr2 9 - + 0.1
Read74 chr1 10 chr2 10 - + 0.1
Read75 chr1 11 chr2 11 - + 0.1
Read76 chr1 12 chr2 12 - + 0.1
Read77 chr1 13 chr2 13 - + 0.1
Read78 chr1 14 chr2 14 - + 0.1
Read79 chr1 15 chr2 15 - + 0.1
Read80 chr1 16 chr2 16 - + 0.1
Read81 chr1 17 chr2 17 - + 0.1
Read82 chr1 18 chr2 18 - + 0.1
Read83 chr1 19 chr2 19 - + 0.1
Read84 chr1 20 chr2 20 - + 0.1
Read85 chr1 21 chr2 21 - + 0.1
Read86 chr1 22 chr2 22 - + 0.1
Read87 chr1 23 chr2 23 - + 0.1
Read88 chr1 24 chr2 24 - + 0.1
Read89 chr1 25 chr2 25 - + 0.1
Read90 chr1 26 chr2 26 - + 0.1
Read91 chr1 27 chr2 27 - + 0.1
Read92 chr1 28 chr2 28 - + 0.1
Read93 chr1 29 chr2 29 - + 0.1
Read94 chr1 30 chr2 30 - + 0.1
Read95 chr1 31 chr2 31 - + 0.1
Read96 chr1 32 chr2 32 - + 0.1
96 changes: 96 additions & 0 deletions tests/data/toy_hash.pairs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
Read#1 chr1 1 chr1 17 - + 0.1
Read#2 chr1 2 chr1 18 - + 0.1
Read#3 chr1 3 chr1 19 - + 0.1
Read#4 chr1 4 chr1 20 - + 0.1
Read#5 chr1 5 chr1 21 - + 0.1
Read#6 chr1 6 chr1 22 - + 0.1
Read#7 chr1 7 chr1 23 - + 0.1
Read#8 chr1 8 chr1 24 - + 0.1
Read#9 chr1 9 chr1 25 - + 0.1
Read#10 chr1 10 chr1 26 - + 0.1
Read#11 chr1 11 chr1 27 - + 0.1
Read#12 chr1 12 chr1 28 - + 0.1
Read#13 chr1 13 chr1 29 - + 0.1
Read#14 chr1 14 chr1 30 - + 0.1
Read#15 chr1 15 chr1 31 - + 0.1
Read#16 chr1 16 chr1 32 - + 0.1
Read#17 chr2 17 chr2 1 - + 0.1
Read#18 chr2 18 chr2 2 - + 0.1
Read#19 chr2 19 chr2 3 - + 0.1
Read#20 chr2 20 chr2 4 - + 0.1
Read#21 chr2 21 chr2 5 - + 0.1
Read#22 chr2 22 chr2 6 - + 0.1
Read#23 chr2 23 chr2 7 - + 0.1
Read#24 chr2 24 chr2 8 - + 0.1
Read#25 chr2 25 chr2 9 - + 0.1
Read#26 chr2 26 chr2 10 - + 0.1
Read#27 chr2 27 chr2 11 - + 0.1
Read#28 chr2 28 chr2 12 - + 0.1
Read#29 chr2 29 chr2 13 - + 0.1
Read#30 chr2 30 chr2 14 - + 0.1
Read#31 chr2 31 chr2 15 - + 0.1
Read#32 chr2 32 chr2 16 - + 0.1
Read#33 chr2 1 chr1 32 - + 0.1
Read#34 chr2 2 chr1 31 - + 0.1
Read#35 chr2 3 chr1 30 - + 0.1
Read#36 chr2 4 chr1 29 - + 0.1
Read#37 chr2 5 chr1 28 - + 0.1
Read#38 chr2 6 chr1 27 - + 0.1
Read#39 chr2 7 chr1 26 - + 0.1
Read#40 chr2 8 chr1 25 - + 0.1
Read#41 chr2 9 chr1 24 - + 0.1
Read#42 chr2 10 chr1 23 - + 0.1
Read#43 chr2 11 chr1 22 - + 0.1
Read#44 chr2 12 chr1 21 - + 0.1
Read#45 chr2 13 chr1 20 - + 0.1
Read#46 chr2 14 chr1 19 - + 0.1
Read#47 chr2 15 chr1 18 - + 0.1
Read#48 chr2 16 chr1 17 - + 0.1
Read#49 chr2 17 chr1 16 - + 0.1
Read#50 chr2 18 chr1 15 - + 0.1
Read#51 chr2 19 chr1 14 - + 0.1
Read#52 chr2 20 chr1 13 - + 0.1
Read#53 chr2 21 chr1 12 - + 0.1
Read#54 chr2 22 chr1 11 - + 0.1
Read#55 chr2 23 chr1 10 - + 0.1
Read#56 chr2 24 chr1 9 - + 0.1
Read#57 chr2 25 chr1 8 - + 0.1
Read#58 chr2 26 chr1 7 - + 0.1
Read#59 chr2 27 chr1 6 - + 0.1
Read#60 chr2 28 chr1 5 - + 0.1
Read#61 chr2 29 chr1 4 - + 0.1
Read#62 chr2 30 chr1 3 - + 0.1
Read#63 chr2 31 chr1 2 - + 0.1
Read#64 chr2 32 chr1 1 - + 0.1
Read#65 chr1 1 chr2 1 - + 0.1
Read#66 chr1 2 chr2 2 - + 0.1
Read#67 chr1 3 chr2 3 - + 0.1
Read#68 chr1 4 chr2 4 - + 0.1
Read#69 chr1 5 chr2 5 - + 0.1
Read#70 chr1 6 chr2 6 - + 0.1
Read#71 chr1 7 chr2 7 - + 0.1
Read#72 chr1 8 chr2 8 - + 0.1
Read#73 chr1 9 chr2 9 - + 0.1
Read#74 chr1 10 chr2 10 - + 0.1
Read#75 chr1 11 chr2 11 - + 0.1
Read#76 chr1 12 chr2 12 - + 0.1
Read#77 chr1 13 chr2 13 - + 0.1
Read#78 chr1 14 chr2 14 - + 0.1
Read#79 chr1 15 chr2 15 - + 0.1
Read#80 chr1 16 chr2 16 - + 0.1
Read#81 chr1 17 chr2 17 - + 0.1
Read#82 chr1 18 chr2 18 - + 0.1
Read#83 chr1 19 chr2 19 - + 0.1
Read#84 chr1 20 chr2 20 - + 0.1
Read#85 chr1 21 chr2 21 - + 0.1
Read#86 chr1 22 chr2 22 - + 0.1
Read#87 chr1 23 chr2 23 - + 0.1
Read#88 chr1 24 chr2 24 - + 0.1
Read#89 chr1 25 chr2 25 - + 0.1
Read#90 chr1 26 chr2 26 - + 0.1
Read#91 chr1 27 chr2 27 - + 0.1
Read#92 chr1 28 chr2 28 - + 0.1
Read#93 chr1 29 chr2 29 - + 0.1
Read#94 chr1 30 chr2 30 - + 0.1
Read#95 chr1 31 chr2 31 - + 0.1
Read#96 chr1 32 chr2 32 - + 0.1
Loading

0 comments on commit e9e3592

Please sign in to comment.