diff --git a/.vs/ProjectSettings.json b/.vs/ProjectSettings.json new file mode 100644 index 0000000..f8b4888 --- /dev/null +++ b/.vs/ProjectSettings.json @@ -0,0 +1,3 @@ +{ + "CurrentProjectSetting": null +} \ No newline at end of file diff --git a/.vs/VSWorkspaceState.json b/.vs/VSWorkspaceState.json new file mode 100644 index 0000000..6e20768 --- /dev/null +++ b/.vs/VSWorkspaceState.json @@ -0,0 +1,8 @@ +{ + "ExpandedNodes": [ + "", + "\\pygetpapers" + ], + "SelectedNode": "\\pygetpapers\\pygetpapers.py", + "PreviewInSolutionExplorer": false +} \ No newline at end of file diff --git a/.vs/getpaper/v17/.suo b/.vs/getpaper/v17/.suo new file mode 100644 index 0000000..a3ace1c Binary files /dev/null and b/.vs/getpaper/v17/.suo differ diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite new file mode 100644 index 0000000..ab7a7da Binary files /dev/null and b/.vs/slnx.sqlite differ diff --git a/pygetpapers/__pycache__/pygetpapers.cpython-37.pyc b/pygetpapers/__pycache__/pygetpapers.cpython-37.pyc index ca97ad5..8e6bf6b 100644 Binary files a/pygetpapers/__pycache__/pygetpapers.cpython-37.pyc and b/pygetpapers/__pycache__/pygetpapers.cpython-37.pyc differ diff --git a/pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc b/pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc index 47099a1..e5d34bc 100644 Binary files a/pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc and b/pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc differ diff --git a/pygetpapers/config.ini b/pygetpapers/config.ini index 695cf08..f5500b9 100644 --- a/pygetpapers/config.ini +++ b/pygetpapers/config.ini @@ -1,5 +1,5 @@ [pygetpapers] -version=1.1.7 +version=1.1.8 [europe_pmc] query_url=https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST diff --git a/pygetpapers/pygetpapers.py b/pygetpapers/pygetpapers.py index c185cab..40c232a 100644 --- a/pygetpapers/pygetpapers.py +++ b/pygetpapers/pygetpapers.py @@ -159,7 +159,6 @@ def add_terms_from_file(self): elif self.query_namespace[NOTTERMS]: raise PygetpapersError("Please provide a query with not") - def check_query_logic_and_run(self): """Checks the logic in query_namespace and runs pygetpapers for the given query """ diff --git a/pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc b/pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc index 5400839..932e8ae 100644 Binary files a/pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc and b/pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc differ diff --git a/pygetpapers/repository/__pycache__/crossref.cpython-37.pyc b/pygetpapers/repository/__pycache__/crossref.cpython-37.pyc index 0d31bdc..a96e150 100644 Binary files a/pygetpapers/repository/__pycache__/crossref.cpython-37.pyc and b/pygetpapers/repository/__pycache__/crossref.cpython-37.pyc differ diff --git a/pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc b/pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc index c96ace0..eaa9fac 100644 Binary files a/pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc and b/pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc differ diff --git a/pygetpapers/repository/arxiv.py b/pygetpapers/repository/arxiv.py index 0010836..dd7be30 100644 --- a/pygetpapers/repository/arxiv.py +++ b/pygetpapers/repository/arxiv.py @@ -3,7 +3,7 @@ from tqdm import tqdm -import arxiv +import arxiv as arxiv_wrapper from pygetpapers.download_tools import DownloadTools from pygetpapers.pgexceptions import PygetpapersError @@ -50,7 +50,7 @@ from pygetpapers.repositoryinterface import RepositoryInterface class Arxiv(RepositoryInterface): - """Arxiv class which handles arxiv repository""" + """Arxiv class which handles arxiv repository. It uses arxiv repository wrapper to make its query(check https://github.com/lukasschwab/arxiv.py)""" def __init__(self): self.download_tools = DownloadTools(ARXIV) @@ -76,14 +76,14 @@ def arxiv( :rtype: dict """ logging.info("Making request to Arxiv through pygetpapers") - search = arxiv.Search( - query=query, max_results=cutoff_size, sort_by=arxiv.SortCriterion.Relevance + search = arxiv_wrapper.Search( + query=query, max_results=cutoff_size, sort_by=arxiv_wrapper.SortCriterion.Relevance ) - metadata_dictionary = {} logging.info("Got request result from Arxiv through pygetpapers") + search_results = search.get() + metadata_dictionary = self._make_metadata_dict_from_arxiv_output(search_results) - self._make_dict_from_arxiv_output(metadata_dictionary, search) for paper in metadata_dictionary: self.download_tools._add_download_status_keys(paper, metadata_dictionary) if getpdf: @@ -91,11 +91,11 @@ def arxiv( self.download_tools.handle_creation_of_csv_html_xml( makecsv, makehtml, makexml, metadata_dictionary, ARXIV_RESULT ) - self.make_json_from_arxiv_dict(metadata_dictionary) + self.write_metadata_json_from_arxiv_dict(metadata_dictionary) return metadata_dictionary - def make_json_from_arxiv_dict(self, metadata_dictionary): + def write_metadata_json_from_arxiv_dict(self, metadata_dictionary): """Iterates through metadata_dictionary and makes json metadata file for papers :param metadata_dictionary: metadata dictionary for papers @@ -110,8 +110,9 @@ def make_json_from_arxiv_dict(self, metadata_dictionary): self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary[result]) @staticmethod - def _make_dict_from_arxiv_output(metadata_dictionary, search): - for result in search.get(): + def _make_metadata_dict_from_arxiv_output(search_results): + metadata_dictionary = {} + for result in search_results: url_encoded_id_of_paper = str(result.entry_id).rsplit("/", 1)[-1] metadata_dictionary[url_encoded_id_of_paper] = {} @@ -142,6 +143,7 @@ def _make_dict_from_arxiv_output(metadata_dictionary, search): result.pdf_url) paper_dict[ENTRY_ID] = str( result.entry_id) + return metadata_dictionary def download_pdf(self, metadata_dictionary): """Downloads pdfs for papers in metadata dictionary diff --git a/pygetpapers/repository/crossref.py b/pygetpapers/repository/crossref.py index 19c8891..632ee3e 100644 --- a/pygetpapers/repository/crossref.py +++ b/pygetpapers/repository/crossref.py @@ -34,7 +34,7 @@ class CrossRef(RepositoryInterface): - """CrossRef class which handles crossref repository""" + """CrossRef class which handles crossref repository. It uses habanero repository wrapper to make its query""" def __init__(self): diff --git a/pygetpapers/repository/europe_pmc.py b/pygetpapers/repository/europe_pmc.py index fdc4d5a..a2cc522 100644 --- a/pygetpapers/repository/europe_pmc.py +++ b/pygetpapers/repository/europe_pmc.py @@ -2,6 +2,7 @@ import logging import os import time +from numpy import True_ import pandas as pd from tqdm import tqdm @@ -70,7 +71,7 @@ class EuropePmc(RepositoryInterface): def __init__(self): self.download_tools = DownloadTools(EUROPEPMC) - def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"): + def query(self, query, cutoff_size, synonym=True, cursor_mark="*"): """Queries eupmc for given query for given number(cutoff_size) papers :param query: query @@ -83,42 +84,42 @@ def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"): :rtype: list """ cutoff_size = int(cutoff_size) - cursor_mark=cursor_mark - ( - list_of_papers, - maximum_hits_per_page, - morepapers, - len_list_papers, - ) = self.create_parameters_for_paper_download() + maximum_hits_per_page=1000 + morepapers=True + len_list_papers=0 counter=0 + print(len_list_papers) + print(cutoff_size) while len_list_papers <= cutoff_size and morepapers is True: - retireved_metadata_dictionary = self.build_and_send_query( + retrieved_metadata_dictionary = self.build_and_send_query( maximum_hits_per_page, cursor_mark, query, synonym ) - if retireved_metadata_dictionary: + if retrieved_metadata_dictionary: counter += 1 - totalhits = retireved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT] + totalhits = retrieved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT] if counter == 1: logging.info("Total Hits are %s", totalhits) if int(totalhits) == 0: logging.warning("Could not find more papers") break - list_of_papers,morepapers = self._add_papers_to_list_of_papers(list_of_papers,retireved_metadata_dictionary) - len_list_papers+=len(list_of_papers) + list_of_paper_metadata,morepapers = self._metadata_dictionary_to_list_of_dictionaries_for_each_paper(retrieved_metadata_dictionary) + len_list_papers+=len(list_of_paper_metadata) morepapers,cursor_mark = self.add_cursor_mark_if_exists( - retireved_metadata_dictionary + retrieved_metadata_dictionary ) - list_of_papers = self.remove_extra_papers_from_list(cutoff_size, list_of_papers) - dictionary_with_papers = self._make_metadata_dictionary_from_list_of_papers(list_of_papers) + list_of_paper_metadata = self.remove_extra_papers_from_list(cutoff_size, list_of_paper_metadata) + dictionary_with_papers = self._make_dictionary_from_list_of_papers(list_of_paper_metadata) + # We use this dictionary_with_papers as a sub dictionary for the metadata_dictionary metadata_dictionary={CURSOR_MARK:cursor_mark,"papers":dictionary_with_papers} return metadata_dictionary - def remove_extra_papers_from_list(self, cutoff_size, list_of_papers): - if len(list_of_papers) > cutoff_size: - list_of_papers = list_of_papers[0:cutoff_size] - return list_of_papers + def remove_extra_papers_from_list(self, cutoff_size, list_of_paper_metadata): + if len(list_of_paper_metadata) > cutoff_size: + list_of_paper_metadata = list_of_paper_metadata[0:cutoff_size] + return list_of_paper_metadata - def _add_papers_to_list_of_papers(self, list_of_papers,retireved_metadata_dictionary): + def _metadata_dictionary_to_list_of_dictionaries_for_each_paper(self,retireved_metadata_dictionary): + list_of_paper_metadata=[] morepapers = True if RESULT in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST]: single_result = isinstance( @@ -126,15 +127,15 @@ def _add_papers_to_list_of_papers(self, list_of_papers,retireved_metadata_dictio ) papers = retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT] if single_result and PMCID in papers: - list_of_papers.append(papers) + list_of_paper_metadata.append(papers) else: for paper in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT]: if PMCID in paper: - list_of_papers.append(paper) + list_of_paper_metadata.append(paper) else: morepapers = False logging.warning("Could not find more papers") - return list_of_papers,morepapers + return list_of_paper_metadata,morepapers def add_cursor_mark_if_exists(self, retireved_metadata_dictionary): @@ -167,12 +168,12 @@ def create_parameters_for_paper_download(): :rtype: [type] """ - list_of_papers = [] + list_of_paper_metadata = [] morepapers = True number_of_papers_there = 0 maximum_hits_per_page = 1000 return ( - list_of_papers, + list_of_paper_metadata, maximum_hits_per_page, morepapers, number_of_papers_there, @@ -261,7 +262,7 @@ def run_eupmc_query_and_get_metadata( cursor_mark= (update[CURSOR_MARK]) else: cursor_mark = "*" - metadata_dictionary = self.europepmc( + metadata_dictionary = self.query( query, cutoff_size, cursor_mark=cursor_mark, synonym=synonym ) self.make_metadata_json( @@ -573,8 +574,6 @@ def add_fields_to_resultant_dict( else: logging.warning("Title not found for paper %s", paper_number) - - def make_metadata_json(self, resultant_dict, update=False): if update: resultant_dict["papers"].update(update["papers"]) @@ -584,9 +583,9 @@ def make_metadata_json(self, resultant_dict, update=False): self.download_tools.dumps_json_to_given_path(jsonurl, resultant_dict) return resultant_dict - def _make_metadata_dictionary_from_list_of_papers(self, list_of_papers): + def _make_dictionary_from_list_of_papers(self, list_of_paper_metadata): resultant_dict = {} - for paper_number, paper in tqdm(enumerate(list_of_papers)): + for paper_number, paper in tqdm(enumerate(list_of_paper_metadata)): paper_number += 1 identifier_for_paper = paper[PMCID] resultant_dict = self.download_tools._make_initial_columns_for_paper_dict( diff --git a/pygetpapers/repositoryinterface.py b/pygetpapers/repositoryinterface.py index ec1087c..86eaea0 100644 --- a/pygetpapers/repositoryinterface.py +++ b/pygetpapers/repositoryinterface.py @@ -18,6 +18,10 @@ RXIV = "rxiv" class RepositoryInterface(ABC): + def __init__(self) -> None: + super().__init__() + self.metadata_dictionary=dict() + @abstractmethod def noexecute(self, query_namespace): """Takes in the query_namespace object as the parameter and runs the query search for given search parameters but only prints the output and not write to disk.