
Commit

fixed bugs
ayush4921 committed Apr 4, 2022
1 parent 5e1666a commit ae79bff
Showing 15 changed files with 59 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
@@ -0,0 +1,3 @@
+{
+  "CurrentProjectSetting": null
+}
8 changes: 8 additions & 0 deletions .vs/VSWorkspaceState.json
@@ -0,0 +1,8 @@
+{
+  "ExpandedNodes": [
+    "",
+    "\\pygetpapers"
+  ],
+  "SelectedNode": "\\pygetpapers\\pygetpapers.py",
+  "PreviewInSolutionExplorer": false
+}
Binary file added .vs/getpaper/v17/.suo
Binary file added .vs/slnx.sqlite
Binary file modified pygetpapers/__pycache__/pygetpapers.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc
2 changes: 1 addition & 1 deletion pygetpapers/config.ini
@@ -1,5 +1,5 @@
 [pygetpapers]
-version=1.1.7
+version=1.1.8
 
 [europe_pmc]
 query_url=https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST
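
Aside: a version value like the one bumped above can be read back with Python's standard configparser; a minimal sketch (the read path is illustrative, not necessarily how pygetpapers loads its config):

import configparser

# Section and key names match the diff above; the file path is an assumption.
config = configparser.ConfigParser()
config.read("pygetpapers/config.ini")
print(config.get("pygetpapers", "version"))  # expected: 1.1.8
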
1 change: 0 additions & 1 deletion pygetpapers/pygetpapers.py
@@ -159,7 +159,6 @@ def add_terms_from_file(self):
         elif self.query_namespace[NOTTERMS]:
             raise PygetpapersError("Please provide a query with not")
 
-
     def check_query_logic_and_run(self):
         """Checks the logic in query_namespace and runs pygetpapers for the given query
         """
Binary file modified pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/crossref.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc
22 changes: 12 additions & 10 deletions pygetpapers/repository/arxiv.py
@@ -3,7 +3,7 @@
 
 from tqdm import tqdm
 
-import arxiv
+import arxiv as arxiv_wrapper
 from pygetpapers.download_tools import DownloadTools
 from pygetpapers.pgexceptions import PygetpapersError

@@ -50,7 +50,7 @@
 from pygetpapers.repositoryinterface import RepositoryInterface
 
 class Arxiv(RepositoryInterface):
-    """Arxiv class which handles arxiv repository"""
+    """Arxiv class which handles arxiv repository. It uses arxiv repository wrapper to make its query(check https://github.com/lukasschwab/arxiv.py)"""
 
     def __init__(self):
         self.download_tools = DownloadTools(ARXIV)
@@ -76,26 +76,26 @@ def arxiv(
         :rtype: dict
         """
         logging.info("Making request to Arxiv through pygetpapers")
-        search = arxiv.Search(
-            query=query, max_results=cutoff_size, sort_by=arxiv.SortCriterion.Relevance
+        search = arxiv_wrapper.Search(
+            query=query, max_results=cutoff_size, sort_by=arxiv_wrapper.SortCriterion.Relevance
         )
 
-        metadata_dictionary = {}
         logging.info("Got request result from Arxiv through pygetpapers")
+        search_results = search.get()
+        metadata_dictionary = self._make_metadata_dict_from_arxiv_output(search_results)
 
-        self._make_dict_from_arxiv_output(metadata_dictionary, search)
         for paper in metadata_dictionary:
             self.download_tools._add_download_status_keys(paper, metadata_dictionary)
         if getpdf:
             self.download_pdf(metadata_dictionary)
         self.download_tools.handle_creation_of_csv_html_xml(
             makecsv, makehtml, makexml, metadata_dictionary, ARXIV_RESULT
         )
-        self.make_json_from_arxiv_dict(metadata_dictionary)
+        self.write_metadata_json_from_arxiv_dict(metadata_dictionary)
 
         return metadata_dictionary
 
-    def make_json_from_arxiv_dict(self, metadata_dictionary):
+    def write_metadata_json_from_arxiv_dict(self, metadata_dictionary):
         """Iterates through metadata_dictionary and makes json metadata file for papers
 
         :param metadata_dictionary: metadata dictionary for papers
@@ -110,8 +110,9 @@ def make_json_from_arxiv_dict(self, metadata_dictionary):
             self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary[result])
 
     @staticmethod
-    def _make_dict_from_arxiv_output(metadata_dictionary, search):
-        for result in search.get():
+    def _make_metadata_dict_from_arxiv_output(search_results):
+        metadata_dictionary = {}
+        for result in search_results:
             url_encoded_id_of_paper = str(result.entry_id).rsplit("/", 1)[-1]
 
             metadata_dictionary[url_encoded_id_of_paper] = {}
@@ -142,6 +143,7 @@ def _make_dict_from_arxiv_output(metadata_dictionary, search):
                 result.pdf_url)
             paper_dict[ENTRY_ID] = str(
                 result.entry_id)
+        return metadata_dictionary
 
     def download_pdf(self, metadata_dictionary):
         """Downloads pdfs for papers in metadata dictionary
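
For context, the class above drives the lukasschwab/arxiv.py wrapper. A minimal sketch of that API, assuming a 1.x release of the package (current releases expose results as a generator via Search.results(); the search.get() call in the diff matches the wrapper version pinned at the time):

import arxiv as arxiv_wrapper  # same alias the commit introduces

search = arxiv_wrapper.Search(
    query="all:electron",  # illustrative query
    max_results=5,
    sort_by=arxiv_wrapper.SortCriterion.Relevance,
)
for result in search.results():  # yields arxiv.Result objects
    print(result.entry_id, result.title)
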
2 changes: 1 addition & 1 deletion pygetpapers/repository/crossref.py
@@ -34,7 +34,7 @@
 
 
 class CrossRef(RepositoryInterface):
-    """CrossRef class which handles crossref repository"""
+    """CrossRef class which handles crossref repository. It uses habanero repository wrapper to make its query"""
 
     def __init__(self):
 
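
For context, a minimal sketch of the habanero wrapper the new docstring refers to; the query and the printed fields are illustrative, and the response follows the Crossref REST API layout:

from habanero import Crossref

crossref_api = Crossref()
response = crossref_api.works(query="open science", limit=3)
for item in response["message"]["items"]:
    print(item.get("DOI"), item.get("title"))
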
61 changes: 30 additions & 31 deletions pygetpapers/repository/europe_pmc.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import time
+from numpy import True_
 
 import pandas as pd
 from tqdm import tqdm
@@ -70,7 +71,7 @@ class EuropePmc(RepositoryInterface):
     def __init__(self):
         self.download_tools = DownloadTools(EUROPEPMC)
 
-    def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"):
+    def query(self, query, cutoff_size, synonym=True, cursor_mark="*"):
         """Queries eupmc for given query for given number(cutoff_size) papers
 
         :param query: query
@@ -83,58 +84,58 @@ def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"):
         :rtype: list
         """
         cutoff_size = int(cutoff_size)
-        cursor_mark=cursor_mark
-        (
-            list_of_papers,
-            maximum_hits_per_page,
-            morepapers,
-            len_list_papers,
-        ) = self.create_parameters_for_paper_download()
+        maximum_hits_per_page=1000
+        morepapers=True
+        len_list_papers=0
+        counter=0
+        print(len_list_papers)
+        print(cutoff_size)
         while len_list_papers <= cutoff_size and morepapers is True:
-            retireved_metadata_dictionary = self.build_and_send_query(
+            retrieved_metadata_dictionary = self.build_and_send_query(
                 maximum_hits_per_page, cursor_mark, query, synonym
             )
-            if retireved_metadata_dictionary:
+            if retrieved_metadata_dictionary:
                 counter += 1
-                totalhits = retireved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT]
+                totalhits = retrieved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT]
                 if counter == 1:
                     logging.info("Total Hits are %s", totalhits)
                 if int(totalhits) == 0:
                     logging.warning("Could not find more papers")
                     break
-                list_of_papers,morepapers = self._add_papers_to_list_of_papers(list_of_papers,retireved_metadata_dictionary)
-                len_list_papers+=len(list_of_papers)
+                list_of_paper_metadata,morepapers = self._metadata_dictionary_to_list_of_dictionaries_for_each_paper(retrieved_metadata_dictionary)
+                len_list_papers+=len(list_of_paper_metadata)
                 morepapers,cursor_mark = self.add_cursor_mark_if_exists(
-                    retireved_metadata_dictionary
+                    retrieved_metadata_dictionary
                 )
-        list_of_papers = self.remove_extra_papers_from_list(cutoff_size, list_of_papers)
-        dictionary_with_papers = self._make_metadata_dictionary_from_list_of_papers(list_of_papers)
+        list_of_paper_metadata = self.remove_extra_papers_from_list(cutoff_size, list_of_paper_metadata)
+        dictionary_with_papers = self._make_dictionary_from_list_of_papers(list_of_paper_metadata)
         # We use this dictionary_with_papers as a sub dictionary for the metadata_dictionary
         metadata_dictionary={CURSOR_MARK:cursor_mark,"papers":dictionary_with_papers}
         return metadata_dictionary
 
-    def remove_extra_papers_from_list(self, cutoff_size, list_of_papers):
-        if len(list_of_papers) > cutoff_size:
-            list_of_papers = list_of_papers[0:cutoff_size]
-        return list_of_papers
+    def remove_extra_papers_from_list(self, cutoff_size, list_of_paper_metadata):
+        if len(list_of_paper_metadata) > cutoff_size:
+            list_of_paper_metadata = list_of_paper_metadata[0:cutoff_size]
+        return list_of_paper_metadata
 
-    def _add_papers_to_list_of_papers(self, list_of_papers,retireved_metadata_dictionary):
+    def _metadata_dictionary_to_list_of_dictionaries_for_each_paper(self,retireved_metadata_dictionary):
+        list_of_paper_metadata=[]
         morepapers = True
         if RESULT in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST]:
             single_result = isinstance(
                 retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT],dict
             )
             papers = retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT]
             if single_result and PMCID in papers:
-                list_of_papers.append(papers)
+                list_of_paper_metadata.append(papers)
             else:
                 for paper in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT]:
                     if PMCID in paper:
-                        list_of_papers.append(paper)
+                        list_of_paper_metadata.append(paper)
                     else:
                         morepapers = False
                         logging.warning("Could not find more papers")
-        return list_of_papers,morepapers
+        return list_of_paper_metadata,morepapers
 
 
     def add_cursor_mark_if_exists(self, retireved_metadata_dictionary):
@@ -167,12 +168,12 @@ def create_parameters_for_paper_download():
         :rtype: [type]
         """
 
-        list_of_papers = []
+        list_of_paper_metadata = []
         morepapers = True
         number_of_papers_there = 0
         maximum_hits_per_page = 1000
         return (
-            list_of_papers,
+            list_of_paper_metadata,
             maximum_hits_per_page,
             morepapers,
             number_of_papers_there,
@@ -261,7 +262,7 @@ def run_eupmc_query_and_get_metadata(
             cursor_mark= (update[CURSOR_MARK])
         else:
             cursor_mark = "*"
-        metadata_dictionary = self.europepmc(
+        metadata_dictionary = self.query(
             query, cutoff_size, cursor_mark=cursor_mark, synonym=synonym
         )
         self.make_metadata_json(
@@ -573,8 +574,6 @@ def add_fields_to_resultant_dict(
             else:
                 logging.warning("Title not found for paper %s", paper_number)
 
-
-
     def make_metadata_json(self, resultant_dict, update=False):
         if update:
             resultant_dict["papers"].update(update["papers"])
@@ -584,9 +583,9 @@ def make_metadata_json(self, resultant_dict, update=False):
         self.download_tools.dumps_json_to_given_path(jsonurl, resultant_dict)
         return resultant_dict
 
-    def _make_metadata_dictionary_from_list_of_papers(self, list_of_papers):
+    def _make_dictionary_from_list_of_papers(self, list_of_paper_metadata):
         resultant_dict = {}
-        for paper_number, paper in tqdm(enumerate(list_of_papers)):
+        for paper_number, paper in tqdm(enumerate(list_of_paper_metadata)):
             paper_number += 1
             identifier_for_paper = paper[PMCID]
             resultant_dict = self.download_tools._make_initial_columns_for_paper_dict(
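
For context, the renamed query() method implements Europe PMC's cursor-mark pagination. A minimal standalone sketch of that pattern against the REST service configured in config.ini (the query, pageSize, and 50-record cutoff are illustrative):

import requests

EPMC_SEARCH = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
cursor_mark = "*"
papers = []
while len(papers) < 50:  # stand-in for cutoff_size
    page = requests.get(EPMC_SEARCH, params={
        "query": "climate change",
        "format": "json",
        "pageSize": 25,
        "cursorMark": cursor_mark,
    }).json()
    results = page.get("resultList", {}).get("result", [])
    if not results:
        break  # could not find more papers
    papers.extend(results)
    next_cursor = page.get("nextCursorMark")
    if not next_cursor or next_cursor == cursor_mark:
        break  # last page returns the same cursor
    cursor_mark = next_cursor
print(len(papers), "records fetched")
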
4 changes: 4 additions & 0 deletions pygetpapers/repositoryinterface.py
@@ -18,6 +18,10 @@
 RXIV = "rxiv"
 class RepositoryInterface(ABC):
 
+    def __init__(self) -> None:
+        super().__init__()
+        self.metadata_dictionary=dict()
+
     @abstractmethod
     def noexecute(self, query_namespace):
         """Takes in the query_namespace object as the parameter and runs the query search for given search parameters but only prints the output and not write to disk.
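
For context, a minimal sketch of how the new shared __init__ interacts with a concrete repository class; DummyRepo is hypothetical, and the real interface declares more abstract methods than the one shown in this diff:

from abc import ABC, abstractmethod

class RepositoryInterface(ABC):
    def __init__(self) -> None:
        super().__init__()
        self.metadata_dictionary = dict()  # shared store for every repository

    @abstractmethod
    def noexecute(self, query_namespace):
        """Run the query but only print the output; never write to disk."""

class DummyRepo(RepositoryInterface):
    def noexecute(self, query_namespace):
        print("would search for:", query_namespace, "| store:", self.metadata_dictionary)

DummyRepo().noexecute({"query": "test"})
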
