Commit 24772ed: fixed eupmc
ayush4921 committed Mar 8, 2022
1 parent fce5f42 · commit 24772ed
Showing 16 changed files with 445 additions and 572 deletions.
Binary file modified pygetpapers/__pycache__/download_tools.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/pygetpapers.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc
95 changes: 55 additions & 40 deletions pygetpapers/download_tools.py
@@ -7,6 +7,7 @@
 import ntpath
 import os
 import time
+from urllib import request
 import xml.etree.ElementTree as ET
 import zipfile
 from time import gmtime, strftime
@@ -117,7 +118,7 @@ def setup_config_file(self,config_ini):
         config.read_string(config_file)
         return config

-    def postquery(self, headers, payload):
+    def gets_result_dict_for_query(self, headers, data):
         """Queries query_url provided in configuration file for the given headers and payload and returns result in the form of a python dictionary

         :param headers: headers given to the request
@@ -128,18 +129,31 @@ def gets_result_dict_for_query(self, headers, data):
         :rtype: dictionary
         """
         logging.debug("*/RESTful request for fulltext.xml (D)*/")
-        start = time.time()
-        request_handler = requests.post(
-            self.query_url, data=payload, headers=headers)
-        stop = time.time()
-        logging.debug("*/Got the Query Result */")
-        logging.debug("Time elapsed: %s", (stop - start))
+        request_handler = self.post_query(
+            self.query_url, data=data, headers=headers)
         parser = etree.XMLParser(recover=True)
         e= etree.fromstring(request_handler.content, parser=parser)
         xmlstr = etree.tostring(e, encoding='utf8', method='xml')
         dict_to_return = xmltodict.parse(xmlstr)
         return dict_to_return

+    def post_query(self,url, data=None, headers=None):
+        """Queries url
+
+        :param headers: headers given to the request
+        :type headers: dict
+        :param payload: payload given to the request
+        :type payload: dict
+        :return: result in the form of a python dictionary
+        :rtype: dictionary
+        """
+        start = time.time()
+        request_handler = requests.post(url,data=data, headers=headers)
+        stop = time.time()
+        logging.debug("*/Got the Query Result */")
+        logging.debug("Time elapsed: %s", (stop - start))
+        return request_handler
+
     @staticmethod
     def check_or_make_directory(directory_url):
         """Makes directory if doesn't already exist
@@ -241,8 +255,8 @@ def _eupmc_clean_dict_for_csv(paperdict):
         return dict_to_write

     @staticmethod
-    def _make_dataframe_for_paper_dict(result, return_dict):
-        dict_for_df = {k: [v] for k, v in return_dict[result].items()}
+    def _make_dataframe_for_paper_dict(result, metadata_dictionary):
+        dict_for_df = {k: [v] for k, v in metadata_dictionary[result].items()}
         df_for_paper = pd.DataFrame(dict_for_df)
         return df_for_paper

@@ -445,6 +459,7 @@ def check_if_content_is_zip(self, request_handler):
         :return: if zip file exits
         :rtype: bool
         """
+        file_exits=False
         for chunk in request_handler.iter_content(chunk_size=128):
             if len(chunk) > 0:
                 file_exits = True
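Initialising file_exits to False fixes a latent UnboundLocalError: when the response body was empty, the loop never ran and the function referenced a name that had never been assigned. A sketch of the fixed check; the tail of the function sits outside the hunk, so the return is an assumption:

def check_if_content_is_zip(request_handler):
    # Default to False so an empty body no longer leaves the name unassigned
    file_exits = False
    for chunk in request_handler.iter_content(chunk_size=128):
        if len(chunk) > 0:
            file_exits = True
    return file_exits  # assumed; the real function tail is not shown in the diff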
@@ -491,29 +506,29 @@ def _add_download_status_keys(key_for_dict, resultant_dict):
         resultant_dict[key_for_dict][CSVMADE] = False
         resultant_dict[key_for_dict][HTMLMADE] = False

-    def make_csv_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_csv_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """
         Writes csv content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
         logging.info("Making csv files for metadata at %s", os.getcwd())
-        df = self._get_dataframe_without_additional_pygetpapers_attributes(return_dict)
+        df = self._get_dataframe_without_additional_pygetpapers_attributes(metadata_dictionary)
         self.write_or_append_to_csv(df, name_main_result_file)
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,makecsv=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,makecsv=True)

-    def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=False,makexml=False,makehtml=False):
-        """Write csv, html or html content for papers in return_dict
+    def _make_csv_xml_or_html(self,name_result_file_for_paper,metadata_dictionary,makecsv=False,makexml=False,makehtml=False):
+        """Write csv, html or html content for papers in metadata_dictionary

         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
-        :param return_dict: Dictionary containing papers
-        :type return_dict: dict
+        :param metadata_dictionary: Dictionary containing papers
+        :type metadata_dictionary: dict
         :param makecsv: whether to get csv
         :type makecsv: bool
         :param makehtml: whether to get html
@@ -522,7 +537,7 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
         :type makexml: bool
         """
         paper = 0
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         for result in tqdm(dict_to_use):
             paper += 1
             result_encoded = self.url_encode_id(result)
@@ -533,11 +548,11 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
                 result, dict_to_use)
             if makecsv:
                 self.write_or_append_to_csv(df_for_paper, url)
-                return_dict[result][CSVMADE] = True
+                metadata_dictionary[result][CSVMADE] = True
                 logging.debug("Wrote csv files for paper %s", paper)
             if makehtml:
                 self.make_html_from_dataframe(df_for_paper, url)
-                return_dict[result][HTMLMADE] = True
+                metadata_dictionary[result][HTMLMADE] = True
                 logging.debug("Wrote html files for paper %s", paper)
             if makexml:
                 total_xml_of_paper = dict2xml(
@@ -549,49 +564,49 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
                     file_handler.write(total_xml_of_paper)
                 logging.debug("Wrote xml files for paper %s", paper)

-    def make_html_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_html_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """Writes html content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
         logging.info("Making html files for metadata at %s", os.getcwd())
         htmlurl = os.path.join(os.getcwd(), name_main_result_file)
-        df = self._get_dataframe_without_additional_pygetpapers_attributes(return_dict)
+        df = self._get_dataframe_without_additional_pygetpapers_attributes(metadata_dictionary)
         self.make_html_from_dataframe(df, htmlurl)
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,makehtml=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,makehtml=True)


-    def _get_dataframe_without_additional_pygetpapers_attributes(self, return_dict):
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+    def _get_dataframe_without_additional_pygetpapers_attributes(self, metadata_dictionary):
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         df = pd.DataFrame.from_dict(dict_to_use)
         return df

-    def make_xml_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_xml_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """Writes xml content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         total_xml = dict2xml(dict_to_use, wrap="root", indent="    ")
         logging.info("Making xml files for metadata at %s", os.getcwd())
         xmlurl = os.path.join(os.getcwd(), name_main_result_file)
         with open(xmlurl, "w", encoding="utf-8") as file_handler:
             file_handler.write(total_xml)
         paper = 0
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,paper,makexml=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,paper,makexml=True)

     def handle_creation_of_csv_html_xml(
-        self, makecsv, makehtml, makexml, return_dict, name
+        self, makecsv, makehtml, makexml, metadata_dictionary, name
     ):
         """Writes csv, html, xml for given conditions
@@ -601,20 +616,20 @@ def handle_creation_of_csv_html_xml(
         :type makehtml: bool
         :param makexml: whether to get xml
         :type makexml: bool
-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name: name of the file to save
         :type name: string
         """

         if makecsv:
             self.make_csv_for_dict(
-                return_dict, f"{name}s.csv", f"{name}.csv")
+                metadata_dictionary, f"{name}s.csv", f"{name}.csv")
         if makehtml:
             self.make_html_for_dict(
-                return_dict, f"{name}s.html", f"{name}.html")
+                metadata_dictionary, f"{name}s.html", f"{name}.html")
         if makexml:
-            self.make_xml_for_dict(return_dict, f"{name}s.xml", f"{name}.xml")
+            self.make_xml_for_dict(metadata_dictionary, f"{name}s.xml", f"{name}.xml")

     @staticmethod
     def url_encode_id(doi_of_paper):
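The remaining download_tools.py changes thread the return_dict to metadata_dictionary rename through the csv, html and xml writers without changing behaviour. A hedged usage sketch of the fan-out entry point; the constructor argument and the per-paper dictionary shape are assumptions modelled on the diff:

from pygetpapers.download_tools import DownloadTools

# One entry per paper id; flat metadata plus the status keys seeded by
# _add_download_status_keys (key names assumed from the diff's constants)
metadata_dictionary = {
    "PMC8625345": {"title": "Example paper", "csvmade": False, "htmlmade": False},
}

download_tools = DownloadTools("europe_pmc")  # mirrors DownloadTools(ARXIV) above
# With name="europe_pmc-result" this writes europe_pmc-results.csv for the whole
# set and europe_pmc-result.csv per paper, per the f"{name}s.csv" pattern
download_tools.handle_creation_of_csv_html_xml(
    makecsv=True, makehtml=False, makexml=False,
    metadata_dictionary=metadata_dictionary, name="europe_pmc-result",
)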
7 changes: 5 additions & 2 deletions pygetpapers/pygetpapers.py
@@ -115,13 +115,16 @@ def _add_date_to_query(self):
         else:
             self.query_namespace[DATE_OR_NUMBER_OF_PAPERS] = f'{self.query_namespace[STARTDATE]}/{self.query_namespace[ENDDATE]}'

-        if self.query_namespace[STARTDATE] and self.query_namespace[ENDDATE]:
+
+        if self.query_namespace[STARTDATE] and self.query_namespace[ENDDATE] and self.query_namespace[API]==EUROPEPMC:
             self.query_namespace[QUERY] = (
                 f'({self.query_namespace[QUERY]}) AND (FIRST_PDATE:[{self.query_namespace[STARTDATE]} TO {self.query_namespace[ENDDATE]}])'
             )
-        elif self.query_namespace[ENDDATE]:
+        elif self.query_namespace[ENDDATE] and self.query_namespace[API]==EUROPEPMC:
             self.query_namespace[QUERY] = f'({self.query_namespace[QUERY]}) AND (FIRST_PDATE:[TO {self.query_namespace[ENDDATE]}])'

+        if self.query_namespace[API]==BIORXIV or self.query_namespace[API]==MEDRXIV:
+            self.query_namespace[QUERY] = self.query_namespace[DATE_OR_NUMBER_OF_PAPERS]
     def add_terms_from_file(self):
         """Builds query from terms mentioned in a text file described in the argparse namespace object. See (https://pygetpapers.readthedocs.io/en/latest/index.html?highlight=terms#querying-using-a-term-list)
         Edits the namespace object's query flag.
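These new API guards matter because the FIRST_PDATE range filter is Europe PMC query syntax, while the biorxiv and medrxiv endpoints take the bare date interval in place of a query. A rough illustration of what each branch now produces; the constant values are assumed stand-ins for the module's repository-name strings:

EUROPEPMC, BIORXIV, MEDRXIV = "europe_pmc", "biorxiv", "medrxiv"  # assumed values

def build_query(query, startdate, enddate, api):
    # Europe PMC understands FIRST_PDATE range filters inside the query itself
    if startdate and enddate and api == EUROPEPMC:
        return f"({query}) AND (FIRST_PDATE:[{startdate} TO {enddate}])"
    # biorxiv/medrxiv instead receive the interval as the whole query
    if api in (BIORXIV, MEDRXIV):
        return f"{startdate}/{enddate}"
    return query

print(build_query("terpene", "2021-01-01", "2021-12-31", EUROPEPMC))
# (terpene) AND (FIRST_PDATE:[2021-01-01 TO 2021-12-31])
print(build_query("terpene", "2021-01-01", "2021-12-31", BIORXIV))
# 2021-01-01/2021-12-31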
Binary file modified pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/crossref.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/rxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/rxivist.cpython-37.pyc
73 changes: 49 additions & 24 deletions pygetpapers/repository/arxiv.py
@@ -56,44 +56,66 @@ def __init__(self):
         self.download_tools = DownloadTools(ARXIV)

     def arxiv(
-        self, query, size, getpdf=False, makecsv=False, makexml=False, makehtml=False
+        self, query, cutoff_size, getpdf=False, makecsv=False, makexml=False, makehtml=False
     ):
+        """Builds the arxiv searcher and writes the xml, pdf, csv and html
+
+        :param query: query given to arxiv
+        :type query: string
+        :param cutoff_size: number of papers to retrieve
+        :type cutoff_size: int
+        :param getpdf: whether to get pdf
+        :type getpdf: bool, optional
+        :param makecsv: whether to get csv
+        :type makecsv: bool
+        :param makehtml: whether to get html
+        :type makehtml: bool
+        :param makexml: whether to get xml
+        :type makexml: bool
+        :return: dictionary of results retrieved from arxiv
+        :rtype: dict
+        """
         logging.info("Making request to Arxiv through pygetpapers")
         search = arxiv.Search(
-            query=query, max_results=size, sort_by=arxiv.SortCriterion.Relevance
+            query=query, max_results=cutoff_size, sort_by=arxiv.SortCriterion.Relevance
         )

-        return_dict = {}
+        metadata_dictionary = {}
         logging.info("Got request result from Arxiv through pygetpapers")

-        self.make_dict_from_arxiv_output(return_dict, search)
-        for paper in return_dict:
-            self.download_tools._add_download_status_keys(paper, return_dict)
+        self._make_dict_from_arxiv_output(metadata_dictionary, search)
+        for paper in metadata_dictionary:
+            self.download_tools._add_download_status_keys(paper, metadata_dictionary)
         if getpdf:
-            self.download_pdf(return_dict)
+            self.download_pdf(metadata_dictionary)
         self.download_tools.handle_creation_of_csv_html_xml(
-            makecsv, makehtml, makexml, return_dict, ARXIV_RESULT
+            makecsv, makehtml, makexml, metadata_dictionary, ARXIV_RESULT
         )
-        self.make_json_from_arxiv_dict(return_dict)
+        self.make_json_from_arxiv_dict(metadata_dictionary)

-        return return_dict
+        return metadata_dictionary

-    def make_json_from_arxiv_dict(self, return_dict):
+    def make_json_from_arxiv_dict(self, metadata_dictionary):
+        """Iterates through metadata_dictionary and makes json metadata file for papers
+
+        :param metadata_dictionary: metadata dictionary for papers
+        :type metadata_dictionary: dict
+        """
         jsonurl = os.path.join(os.getcwd(), ARXIV_RESULTS_JSON)
-        self.download_tools.dumps_json_to_given_path(jsonurl, return_dict)
-        for result in tqdm(return_dict):
-            return_dict[result][JSONDOWNLOADED] = True
+        self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary)
+        for result in tqdm(metadata_dictionary):
+            metadata_dictionary[result][JSONDOWNLOADED] = True
             self.download_tools.check_or_make_directory(result)
             jsonurl = os.path.join(os.getcwd(), result, ARXIV_RESULT_JSON)
-            self.download_tools.dumps_json_to_given_path(jsonurl, return_dict[result])
+            self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary[result])

     @staticmethod
-    def make_dict_from_arxiv_output(return_dict, search):
+    def _make_dict_from_arxiv_output(metadata_dictionary, search):
         for result in search.get():
             url_encoded_id_of_paper = str(result.entry_id).rsplit("/", 1)[-1]

-            return_dict[url_encoded_id_of_paper] = {}
-            paper_dict = return_dict[url_encoded_id_of_paper]
+            metadata_dictionary[url_encoded_id_of_paper] = {}
+            paper_dict = metadata_dictionary[url_encoded_id_of_paper]
             paper_dict[DATE_UPDATED] = str(
                 result.updated)
             paper_dict[DATE_PUBLISHED] = str(
@@ -121,22 +143,25 @@ def make_dict_from_arxiv_output(return_dict, search):
             paper_dict[ENTRY_ID] = str(
                 result.entry_id)

-    def download_pdf(self, return_dict):
-
+    def download_pdf(self, metadata_dictionary):
+        """Downloads pdfs for papers in metadata dictionary
+
+        :param metadata_dictionary: metadata dictionary for papers
+        :type metadata_dictionary: dict
+        """
         logging.info("Downloading Pdfs for papers")
-        for result in tqdm(return_dict):
+        for result in tqdm(metadata_dictionary):
             self.download_tools.check_or_make_directory(
                 os.path.join(os.getcwd(), result)
             )
             pdf_url = os.path.join(os.getcwd(), result, FULLTEXT_PDF)
             self.download_tools.queries_the_url_and_writes_response_to_destination(
-                return_dict[result][PDF_URL], pdf_url
+                metadata_dictionary[result][PDF_URL], pdf_url
             )
-            return_dict[result][PDFDOWNLOADED] = True
+            metadata_dictionary[result][PDFDOWNLOADED] = True

     @staticmethod
     def noexecute(query_namespace):
-
         logging.info("Arxiv api working for the query %s", query_namespace["query"])

     @staticmethod
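For orientation, a minimal sketch of the renamed arxiv flow: run a search capped at cutoff_size, then flatten each result into a per-paper metadata dictionary keyed by the URL-encoded entry id. The dictionary keys below are lower-case stand-ins for the module's constants, and search.get() matches the arxiv package version this diff targets (newer releases call it results()):

import arxiv  # the PyPI "arxiv" package that pygetpapers wraps

search = arxiv.Search(
    query="quantum computing",
    max_results=5,  # the renamed cutoff_size parameter
    sort_by=arxiv.SortCriterion.Relevance,
)

metadata_dictionary = {}
for result in search.get():
    # Key each paper by the tail of its entry id, as the diff does
    paper_id = str(result.entry_id).rsplit("/", 1)[-1]
    metadata_dictionary[paper_id] = {
        "date_updated": str(result.updated),
        "date_published": str(result.published),
        "pdf_url": result.pdf_url,
        "entry_id": str(result.entry_id),
    }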
