From 69df19d4e3692ca6f1642c7e029a61fd6ec72fa3 Mon Sep 17 00:00:00 2001 From: mikeqfu Date: Sun, 21 Mar 2021 21:34:55 +0000 Subject: [PATCH] Fix a bug in Stations.collect_station_data_by_initial() --- pyrcs/other_assets/station.py | 302 +++++++++++++++++----------------- 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/pyrcs/other_assets/station.py b/pyrcs/other_assets/station.py index ccb4d5e..039b825 100644 --- a/pyrcs/other_assets/station.py +++ b/pyrcs/other_assets/station.py @@ -2,6 +2,7 @@ Collect `railway station data `_. """ +import collections import copy import itertools import os @@ -15,7 +16,7 @@ import requests from pyhelpers.dir import cd, validate_input_data_dir from pyhelpers.ops import fake_requests_headers -from pyhelpers.store import load_pickle, save_pickle, save_json, load_json +from pyhelpers.store import load_pickle, save_pickle, load_json from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \ parse_location_name, parse_table, is_internet_connected, print_conn_err, print_connection_error @@ -26,9 +27,8 @@ class Stations: A class for collecting railway station data. :param data_dir: name of data directory, defaults to ``None`` - :type data_dir: str, None - :param verbose: whether to print relevant information in console as the function runs, - defaults to ``True`` + :type data_dir: str or None + :param verbose: whether to print relevant information in console, defaults to ``True`` :type verbose: bool or int :ivar str Name: name of the data @@ -41,8 +41,8 @@ class Stations: :ivar str DataDir: path to the data directory :ivar str CurrentDataDir: path to the current data directory - :ivar str StnKey: key of the dict-type data of railway stations - :ivar str StnPickle: name of the pickle file of railway station data + :ivar str StnKey: key of the dict-type data of railway station locations + :ivar str StnPickle: name of the pickle file of railway station locations :ivar str BilingualKey: key of the dict-type data of bilingual names :ivar str SpStnNameSignKey: key of the dict-type data of sponsored station name signs :ivar str NSFOKey: key of the dict-type data of stations not served by SFO @@ -80,8 +80,8 @@ def __init__(self, data_dir=None, verbose=True): self.LUDKey = 'Last updated date' # key to last updated date self.LUD = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False) - self.StnKey = 'Railway station data' - self.StnPickle = self.StnKey.lower().replace(" ", "-") + self.StnKey = 'Mileages, operators and grid coordinates' + self.StnPickle = self.StnKey.lower().replace(",", "").replace(" ", "-") self.BilingualKey = 'Bilingual names' self.SpStnNameSignKey = 'Sponsored signs' @@ -94,14 +94,14 @@ def __init__(self, data_dir=None, verbose=True): if data_dir: self.DataDir = validate_input_data_dir(data_dir) else: - self.DataDir = cd_dat("other-assets", self.Name.lower()) + self.DataDir = cd_dat("other-assets", self.Key.lower().replace(" ", "-")) self.CurrentDataDir = copy.copy(self.DataDir) def _cdd_stn(self, *sub_dir, **kwargs): """ Change directory to package data directory and sub-directories (and/or a file). - The directory for this module: ``"\\dat\\other-assets\\stations"``. + The directory for this module: ``"dat\\other-assets\\stations"``. :param sub_dir: sub-directory or sub-directories (and/or a file) :type sub_dir: str @@ -122,11 +122,9 @@ def get_station_data_catalogue(self, update=False, verbose=False): """ Get catalogue of railway station data. - :param update: whether to check on update and proceed to update the package data, - defaults to ``False`` + :param update: whether to do an update check (for the package data), defaults to ``False`` :type update: bool - :param verbose: whether to print relevant information in console as the function runs, - defaults to ``False`` + :param verbose: whether to print relevant information in console, defaults to ``False`` :type verbose: bool or int :return: catalogue of railway station data :rtype: dict @@ -141,27 +139,29 @@ def get_station_data_catalogue(self, update=False, verbose=False): >>> stn_data_cat = stn.get_station_data_catalogue() >>> type(stn_data_cat) - dict + collections.OrderedDict >>> list(stn_data_cat.keys()) - ['Railway station data', + ['Mileages, operators and grid coordinates', + 'Bilingual names', 'Sponsored signs', + 'Not served by SFO', 'International', 'Trivia', 'Access rights', - 'Barrier error codes'] + 'Barrier error codes', + 'London Underground'] """ cat_json = '-'.join(x for x in urllib.parse.urlparse(self.SourceURL).path.replace( - '.shtm', '.json').split('/') if x) + '.shtm', '.pickle').split('/') if x) path_to_cat = cd_dat("catalogue", cat_json) if os.path.isfile(path_to_cat) and not update: - catalogue = load_json(path_to_cat) + catalogue = load_pickle(path_to_cat) else: if verbose == 2: - print("Collecting a catalogue of {} data".format(self.StnKey.lower()), - end=" ... ") + print("Collecting a catalogue of {}".format(self.StnKey.lower()), end=" ... ") try: source = requests.get(self.SourceURL, headers=fake_requests_headers()) @@ -179,24 +179,25 @@ def get_station_data_catalogue(self, update=False, verbose=False): hot_soup = {a.text: urllib.parse.urljoin(self.SourceURL, a.get('href')) for a in cold_soup.find_all('a')} - catalogue = {self.StnKey: None} + catalogue = collections.OrderedDict() for k, v in hot_soup.items(): sub_cat = get_catalogue(v, update=True, confirmation_required=False, json_it=False) if sub_cat != hot_soup: - if k == 'Introduction': - catalogue.update({self.StnKey: {k: v, **sub_cat}}) + if k in sub_cat.keys(): + sub_cat.pop(k) + elif 'Introduction' in sub_cat.keys(): + sub_cat.pop('Introduction') + if v in sub_cat.values(): + catalogue[k] = sub_cat else: - catalogue.update({k: sub_cat}) + catalogue[k] = {'Introduction': v, **sub_cat} else: - if k in ('Bilingual names', 'Not served by SFO'): - catalogue[self.StnKey].update({k: v}) - else: - catalogue.update({k: v}) + catalogue.update({k: v}) print("Done.") if verbose == 2 else "" - save_json(catalogue, path_to_cat, verbose=verbose) + save_pickle(catalogue, path_to_cat, verbose=verbose) except Exception as e: print("Failed. {}".format(e)) @@ -204,6 +205,14 @@ def get_station_data_catalogue(self, update=False, verbose=False): return catalogue + @staticmethod + def _parse_degrees(x): + if x == '': + z = np.nan + else: + z = float(x.replace('c.', '') if x.startswith('c.') else x) + return z + @staticmethod def _parse_owner_and_operator(x): """ @@ -220,31 +229,46 @@ def _parse_owner_and_operator(x): pdate_pat = re.compile(r'from\s\d+\s\w+\s[0-9]{4} to \d+ \w+ [0-9]{4}') try: - current_op, past_op = [y.rstrip(', ').strip(',').strip() for y in x_.split('\\r')] + op_lst = [y.rstrip(', ').strip(',').strip() for y in re.split(r'\(\[|\r|\\r', x_)] + op_lst = [p for p in op_lst if p] + if len(op_lst) >= 2: + current_op, past_op = op_lst[0], op_lst[1:] + else: + current_op, past_op = op_lst[0], '' except ValueError: try: - current_op, past_op = [y.rstrip(', ').strip(',').strip() for y in x_.split('\r')] + current_op, past_op = x_.split('\\r, ')[0], x_.split('\\r, ')[1:] except ValueError: - current_op, past_op = x_, None + current_op, past_op = x_, '' - # Current operator - current_name = re.search(cname_pat, current_op) - if current_name and current_op != '': - current_name = current_name.group(0) - else: - current_name = current_op - current_from = re.search(cdate_pat, current_op) - if current_from: - current_from = current_from.group(0) + def get_current_name_date(nd): + n = re.search(cname_pat, nd) + if n and nd != '': + n = n.group(0) + else: + n = nd + d_from = re.search(cdate_pat, nd) + if d_from: + d_from = d_from.group(0) + return [(n, d_from)] + + def get_past_name_date(nd): + dft = re.findall(pdate_pat, nd) + ns = [ + y.strip().lstrip('([') for y in re.split(pdate_pat, nd) if y.strip()] + if not dft: + dft = [''] * len(ns) + return [(n, d) for n, d in zip(ns, dft)] - current_operator = [(current_name, current_from)] + # Current operator + current_operator = get_current_name_date(current_op) if past_op: # Past operators - past_dates = re.findall(pdate_pat, past_op) - past_names = [y.strip().lstrip('([') for y in re.split(pdate_pat, past_op) if y.strip()] + if isinstance(past_op, str): + past_op = [past_op] - past_operators = [(n, d) for n, d in zip(past_names, past_dates)] + past_operators = [get_past_name_date(x) for x in past_op] # for z in parsed_text: # # Operators names @@ -263,7 +287,7 @@ def _parse_owner_and_operator(x): return operators - def extended_info(self, info_dat, name): + def _extended_info(self, info_dat, name): """ Get extended information of the owners/operators. @@ -275,50 +299,52 @@ def extended_info(self, info_dat, name): :rtype: pandas.DataFrame """ - temp = list(info_dat.map(self._parse_owner_and_operator)) - length = len(max(temp, key=len)) + temp = list( + info_dat.map(lambda x: self._parse_owner_and_operator(x) if x else [('', '')])) + + temp_lst = [] + for item in temp: + sub_lst = [] + for sub_item in item: + if isinstance(sub_item, list): + sub_item = sub_item[0] + sub_lst.append(sub_item) + temp_lst.append(sub_lst) + + length = len(max(temp_lst, key=len)) col_names_current = [name, name + '_since'] prev_no = list( - itertools.chain.from_iterable(itertools.repeat(x, 2) for x in list(range(1, length)))) + itertools.chain.from_iterable(itertools.repeat(x, 2) for x in range(1, length))) col_names_ = zip(col_names_current * (length - 1), prev_no) - col_names = col_names_current + ['_'.join(['Prev', x, str(d)]).replace('_since', '_Period') - for x, d in col_names_] + col_names = col_names_current + [ + '_'.join(['Prev', x, str(d)]).replace('_since', '_Period') for x, d in col_names_] - for i in range(len(temp)): - if len(temp[i]) < length: - temp[i] += [(None, None)] * (length - len(temp[i])) + for i in range(len(temp_lst)): + if len(temp_lst[i]) < length: + temp_lst[i] += [('', '')] * (length - len(temp_lst[i])) - temp2 = pd.DataFrame(temp) - extended_info = [temp2[c].apply(pd.Series) for c in temp2.columns] - extended_info = pd.concat(extended_info, axis=1, sort=False) + temp_lst_ = [list(itertools.chain.from_iterable(x)) for x in temp_lst] + extended_info = pd.DataFrame(temp_lst_) + # _extended_info = [temp2[c].apply(pd.Series) for c in temp2.columns] + # _extended_info = pd.concat(_extended_info, axis=1, sort=False) extended_info.columns = col_names - return extended_info + extended_info.fillna('', inplace=True) - @staticmethod - def _parse_degrees(x): - if x == '': - z = np.nan - else: - z = float(x.replace('c.', '') if x.startswith('c.') else x) - return z + return extended_info def collect_station_data_by_initial(self, initial, update=False, verbose=False): """ - Collect `railway station data `_ - for the given ``initial`` letter. + Collect `data of railway station locations + `_ for the given ``initial`` letter. - :param initial: initial letter of station data - (including the station name, ELR, mileage, status, owner, operator, - degrees of longitude and latitude, and grid reference) for specifying URL + :param initial: initial letter of locations of the railway station data :type initial: str - :param update: whether to check on update and proceed to update the package data, - defaults to ``False`` + :param update: whether to do an update check (for the package data), defaults to ``False`` :type update: bool - :param verbose: whether to print relevant information in console as the function runs, - defaults to ``False`` - :type verbose: bool, int - :return: railway station data for the given ``initial`` letter and + :param verbose: whether to print relevant information in console, defaults to ``False`` + :type verbose: bool or int + :return: data of railway station locations beginning with ``initial`` and date of when the data was last updated :rtype: dict @@ -336,13 +362,13 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): >>> list(sa.keys()) ['A', 'Last updated date'] - >>> print(sa['A'].head()) - Station ELR ... Prev_Operator_6 Prev_Operator_Period_6 - 0 Abbey Wood NKL ... None None - 1 Abbey Wood XRS3 ... None None - 2 Aber CAR ... None None - 3 Abercynon North ABD ... None None - 4 ABD ... None None + >>> sa['A'].head() + Station ELR ... Prev_Operator_6 Prev_Operator_Period_6 + 0 Abbey Wood NKL ... + 1 Abbey Wood XRS3 ... + 2 Aber CAR ... + 3 Abercynon CAM ... + 4 Abercynon ABD ... [5 rows x 28 columns] """ @@ -354,25 +380,24 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): railway_station_data = load_pickle(path_to_pickle) else: - url = self.SourceURL.replace('station0', 'station{}'.format(initial.lower())) - railway_station_data = {beginning_with: None, self.LUDKey: None} if verbose == 2: - print("Collecting data of {} beginning with \"{}\"".format( + print("Collecting {} of locations beginning with \"{}\"".format( self.StnKey.lower(), beginning_with), end=" ... ") stn_data_catalogue = self.get_station_data_catalogue() + stn_data_initials = list( + stn_data_catalogue['Mileages, operators and grid coordinates'].keys()) - if beginning_with not in list(stn_data_catalogue[self.StnKey].keys()): - if verbose == 2: - print("No data is available.") - # print("No data is available for signal box codes " - # "beginning with \"{}\".".format(beginning_with)) - # railway_station_table, last_updated_date = None, None - pass + if beginning_with not in stn_data_initials: + if verbose: + print("No data is available for the locations beginning with \"{}\".".format( + beginning_with)) else: + url = self.SourceURL.replace('station0', 'station{}'.format(initial.lower())) + try: source = requests.get(url, headers=fake_requests_headers()) except requests.exceptions.ConnectionError: @@ -393,8 +418,8 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): lambda x: x.split(' ') if not re.match('^[Ss]ee ', x) else [x]) temp_elr_len = temp_elr.map(len).sum() if max(temp_degree_len, temp_elr_len) > len(stn_dat): - temp_col = ['ELR', 'Degrees Longitude', 'Degrees Latitude', - 'Grid Reference'] + temp_col = ['ELR', 'Status', + 'Degrees Longitude', 'Degrees Latitude', 'Grid Reference'] idx = [j for j in stn_dat.index if max(len(temp_degree[j]), len(temp_elr[j])) > 1] @@ -406,14 +431,19 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): for c in col: x_ = stn_dat.loc[i, c] if c in temp_col: - y = x_.split(' ') + x_ = re.sub(r' \(\[\'|\', \'|\']\)', ' ', x_) + if '\r' in x_ or '\\r' in x_: + x_ = re.sub(r'\r|\\r', ',', x_).strip().split(',') + y = [z.strip() for z in x_] + else: + y = x_.strip().split(' ') if len(y) == 1: y = y * t temp_val.append(y) elif c == 'Mileage': y = re.findall(r'\d+m \d+ch|\d+\.\d+km|\w+', x_) if len(y) > t: - y = re.findall(r'\d+m \d+ch', x_) + y = re.findall(r'\d+m \d+ch|unknown', x_) temp_val.append(y) else: temp_val.append([x_] * t) @@ -422,8 +452,7 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): pd.DataFrame(np.array(temp_val, dtype=object).T, columns=col)) stn_dat.drop(idx, axis='index', inplace=True) - stn_dat = pd.concat( - [stn_dat] + temp_vals, axis=0, ignore_index=True) + stn_dat = pd.concat([stn_dat] + temp_vals, axis=0, ignore_index=True) stn_dat.sort_values(['Station'], inplace=True) @@ -437,32 +466,14 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): stn_dat[['Station', 'Station_Note']] = stn_dat.Station.map( parse_location_name).apply(pd.Series) - # Owner - owners = self.extended_info(stn_dat.Owner, name='Owner') + # Owners + owners = self._extended_info(stn_dat.Owner, name='Owner') stn_dat.drop('Owner', axis=1, inplace=True) stn_dat = stn_dat.join(owners) - # Operator - # temp = list(stn_dat.Operator.map(self._parse_owner_and_operator)) - # length = len(max(temp, key=len)) - # col_names_current = ['Operator', 'Date'] - # prev_no = list(itertools.chain.from_iterable( - # itertools.repeat(x, 2) for x in list(range(1, length)))) - # col_names = zip(col_names_current * (length - 1), prev_no) - # col_names = col_names_current + [ - # '_'.join(['Prev', x, str(d)]) for x, d in col_names] - # - # for i in range(len(temp)): - # if len(temp[i]) < length: - # temp[i] += [(None, None)] * (length - len(temp[i])) - # - # temp2 = pd.DataFrame(temp) - # operators = [temp2[c].apply(pd.Series) for c in temp2.columns] - # operators = pd.concat(operators, axis=1, sort=False) - # operators.columns = col_names - - operators = self.extended_info(stn_dat.Operator, name='Operator') + # Operators + operators = self._extended_info(stn_dat.Operator, name='Operator') stn_dat.drop('Operator', axis=1, inplace=True) stn_dat = stn_dat.join(operators) @@ -483,24 +494,19 @@ def collect_station_data_by_initial(self, initial, update=False, verbose=False): def fetch_station_data(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ - Fetch `railway station data `_ - from local backup. + Fetch `data of railway station locations + `_ + (incl. mileages, operators and grid coordinates) from local backup. - :param update: whether to check on update and proceed to update the package data, - defaults to ``False`` + :param update: whether to do an update check (for the package data), defaults to ``False`` :type update: bool - :param pickle_it: whether to replace the current package data with newly collected data, - defaults to ``False`` + :param pickle_it: whether to save the data as a pickle file, defaults to ``False`` :type pickle_it: bool - :param data_dir: name of package data folder, defaults to ``None`` - :type data_dir: str, None - :param verbose: whether to print relevant information in console as the function runs, - defaults to ``False`` - :type verbose: bool, int - :return: railway station data - (including the station name, ELR, mileage, status, owner, operator, - degrees of longitude and latitude, and grid reference) and - date of when the data was last updated + :param data_dir: name of a folder where the pickle file is to be saved, defaults to ``None`` + :type data_dir: str or None + :param verbose: whether to print relevant information in console, defaults to ``False`` + :type verbose: bool or int + :return: data of railway station locations and date of when the data was last updated :rtype: dict **Example**:: @@ -515,20 +521,20 @@ def fetch_station_data(self, update=False, pickle_it=False, data_dir=None, verbo >>> type(rail_stn_data) dict >>> list(rail_stn_data.keys()) - ['Railway station data', 'Last updated date'] + ['Mileages, operators and grid coordinates', 'Last updated date'] - >>> rail_stn_dat = rail_stn_data['Railway station data'] + >>> rail_stn_dat = rail_stn_data[stn.StnKey] >>> type(rail_stn_dat) pandas.core.frame.DataFrame - >>> print(rail_stn_dat.head()) - Station ELR ... Prev_Operator_6 Prev_Operator_Period_6 - 2606 MRL1 ... None None - 723 TAT ... None None - 89 ABD ... None None - 90 CAM ... None None - 85 Abbey Wood NKL ... None None - [5 rows x 32 columns] + >>> rail_stn_dat.head() + Station ELR ... Prev_Operator_6 Prev_Operator_Period_6 + 0 Abbey Wood XRS3 ... + 1 Abbey Wood NKL ... + 2 Aber CAR ... + 3 Abercynon ABD ... + 4 Abercynon CAM ... + [5 rows x 30 columns] """ verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True) @@ -553,6 +559,8 @@ def fetch_station_data(self, update=False, pickle_it=False, data_dir=None, verbo stn_data = stn_data.where(pd.notna(stn_data), None) stn_data.sort_values(['Station'], inplace=True) + stn_data.index = range(len(stn_data)) + last_updated_dates = (d[self.LUDKey] for d in data_sets) latest_update_date = max(d for d in last_updated_dates if d is not None)