Skip to content

Commit

Permalink
Fix get_site_map() and improve parse_tr()
Browse files Browse the repository at this point in the history
  • Loading branch information
mikeqfu committed Sep 14, 2023
1 parent c6e5ba3 commit c306b10
Showing 1 changed file with 69 additions and 63 deletions.
132 changes: 69 additions & 63 deletions pyrcs/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,50 +62,9 @@ def _move_element_to_end(text_, char='\t\t'):
# break


def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
"""
Parse a list of parsed HTML <tr> elements.
See also [`PT-1 <https://stackoverflow.com/questions/28763891/>`_].
:param trs: contents under ``<tr>`` tags of a web page.
:type trs: bs4.ResultSet | list
:param ths: list of column names (usually under a ``<th>`` tag) of a requested table.
:type ths: list | bs4.element.Tag
:param sep: separator that replaces the one in the raw data.
:type sep: str | None
:param as_dataframe: whether to return the parsed data in tabular form
:type as_dataframe: bool
:return: a list of lists that each comprises a row of the requested table
:rtype: pandas.DataFrame | typing.List[list]
**Example**::
>>> from pyrcs.parser import parse_tr
>>> import requests
>>> import bs4
>>> example_url = 'http://www.railwaycodes.org.uk/elrs/elra.shtm'
>>> source = requests.get(example_url)
>>> parsed_text = bs4.BeautifulSoup(markup=source.content, features='html.parser')
>>> ths_dat = [th.text for th in parsed_text.find_all('th')]
>>> trs_dat = parsed_text.find_all(name='tr')
>>> tables_list = parse_tr(trs=trs_dat, ths=ths_dat) # returns a list of lists
>>> type(tables_list)
list
>>> len(tables_list) // 100
1
>>> tables_list[0]
['AAL',
'Ashendon and Aynho Line',
'0.00 - 18.29',
'Ashendon Junction',
'Now NAJ3']
"""

def _prep_records(trs, ths, sep=' / '):
ths_len = len(ths)

records = []
row_spanned = []

Expand Down Expand Up @@ -136,7 +95,13 @@ def parse_tr(trs, ths, sep=' / ', as_dataframe=False):

records.append(data)

return records, row_spanned


def _check_row_spanned(records, row_spanned):
if row_spanned:
records_ = records.copy()

row_spanned_dict = collections.defaultdict(list)
for i, *to_repeat in row_spanned:
row_spanned_dict[i].append(to_repeat)
Expand All @@ -147,28 +112,68 @@ def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
k = i + j
# if (dat in records[i]) and (dat != '\xa0'): # and (idx < len(records[i]) - 1):
# idx += np.abs(records[i].index(dat) - idx, dtype='int64')
k_len = len(records[k])
if k_len < len(records[i]):
k_len = len(records_[k])
if k_len < len(records_[i]):
if k_len == idx:
records[k].insert(idx, dat)
records_[k].insert(idx, dat)
elif k_len > idx:
if records[k][idx] != '':
records[k].insert(idx, dat)
if records_[k][idx] != '':
records_[k].insert(idx, dat)
else: # records[k][idx] == '':
records[k][idx] = dat

# if row_spanned:
# for x in row_spanned:
# for j in range(1, x[2]):
# # Add value in next tr
# idx = x[0] + j
# # assert isinstance(idx, int)
# if x[1] >= len(tbl_lst[idx]):
# tbl_lst[idx].insert(x[1], x[3])
# elif x[3] in tbl_lst[x[0]]:
# tbl_lst[idx].insert(tbl_lst[x[0]].index(x[3]), x[3])
# else:
# tbl_lst[idx].insert(x[1] + 1, x[3])
records_[k][idx] = dat

else:
records_ = records

return records_


def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
"""
Parse a list of parsed HTML <tr> elements.
See also [`PT-1 <https://stackoverflow.com/questions/28763891/>`_].
:param trs: contents under ``<tr>`` tags of a web page.
:type trs: bs4.ResultSet | list
:param ths: list of column names (usually under a ``<th>`` tag) of a requested table.
:type ths: list | bs4.element.Tag
:param sep: separator that replaces the one in the raw data.
:type sep: str | None
:param as_dataframe: whether to return the parsed data in tabular form
:type as_dataframe: bool
:return: rows of the requested table, as a list of lists by default or as a dataframe if ``as_dataframe=True``
:rtype: pandas.DataFrame | typing.List[list]
**Example**::
>>> from pyrcs.parser import parse_tr
>>> import requests
>>> import bs4
>>> example_url = 'http://www.railwaycodes.org.uk/elrs/elra.shtm'
>>> source = requests.get(example_url)
>>> parsed_text = bs4.BeautifulSoup(markup=source.content, features='html.parser')
>>> ths_dat = [th.text for th in parsed_text.find_all('th')]
>>> trs_dat = parsed_text.find_all(name='tr')
>>> tables_list = parse_tr(trs=trs_dat, ths=ths_dat) # returns a list of lists
>>> type(tables_list)
list
>>> len(tables_list) // 100
1
>>> tables_list[0]
['AAL',
'Ashendon and Aynho Line',
'0.00 - 18.29',
'Ashendon Junction',
'Now NAJ3']
"""

records, row_spanned = _prep_records(trs=trs, ths=ths, sep=sep)

records = _check_row_spanned(records, row_spanned)

if isinstance(ths, bs4.Tag):
column_names = [th.text.strip() for th in ths.find_all('th')]
Expand Down Expand Up @@ -401,7 +406,8 @@ def _get_site_map(source):
for h3 in h3s:
h3_title = h3.get_text(strip=True)

h3_dl = h3.find_next_sibling(name='dl')
# h3_dl = h3.find_next_sibling(name='dl')
h3_dl = h3.find_next(name='dl')

h3_dl_dts = h3_dl.find_all(name='dt')

Expand Down

0 comments on commit c306b10

Please sign in to comment.