
Commit

fixed bugs
ayush4921 committed Apr 4, 2022
1 parent 5e1666a commit ae79bff
Showing 15 changed files with 59 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
@@ -0,0 +1,3 @@
+{
+  "CurrentProjectSetting": null
+}
8 changes: 8 additions & 0 deletions .vs/VSWorkspaceState.json
@@ -0,0 +1,8 @@
+{
+  "ExpandedNodes": [
+    "",
+    "\\pygetpapers"
+  ],
+  "SelectedNode": "\\pygetpapers\\pygetpapers.py",
+  "PreviewInSolutionExplorer": false
+}
Binary file added .vs/getpaper/v17/.suo
Binary file added .vs/slnx.sqlite
Binary file modified pygetpapers/__pycache__/pygetpapers.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc
2 changes: 1 addition & 1 deletion pygetpapers/config.ini
@@ -1,5 +1,5 @@
 [pygetpapers]
-version=1.1.7
+version=1.1.8
 
 [europe_pmc]
 query_url=https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST
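
Aside: a version value like the one bumped above can be read back with Python's standard configparser; a minimal sketch (the read path is illustrative, not necessarily how pygetpapers loads its config):

import configparser

# Section and key names match the diff above; the file path is an assumption.
config = configparser.ConfigParser()
config.read("pygetpapers/config.ini")
print(config.get("pygetpapers", "version"))  # expected: 1.1.8
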
1 change: 0 additions & 1 deletion pygetpapers/pygetpapers.py
@@ -159,7 +159,6 @@ def add_terms_from_file(self):
         elif self.query_namespace[NOTTERMS]:
             raise PygetpapersError("Please provide a query with not")
 
-
     def check_query_logic_and_run(self):
         """Checks the logic in query_namespace and runs pygetpapers for the given query
         """
Binary file modified pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/crossref.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc
22 changes: 12 additions & 10 deletions pygetpapers/repository/arxiv.py
@@ -3,7 +3,7 @@
 
 from tqdm import tqdm
 
-import arxiv
+import arxiv as arxiv_wrapper
 from pygetpapers.download_tools import DownloadTools
 from pygetpapers.pgexceptions import PygetpapersError

@@ -50,7 +50,7 @@
 from pygetpapers.repositoryinterface import RepositoryInterface
 
 class Arxiv(RepositoryInterface):
-    """Arxiv class which handles arxiv repository"""
+    """Arxiv class which handles arxiv repository. It uses arxiv repository wrapper to make its query(check https://github.com/lukasschwab/arxiv.py)"""
 
     def __init__(self):
         self.download_tools = DownloadTools(ARXIV)
@@ -76,26 +76,26 @@ def arxiv(
         :rtype: dict
         """
         logging.info("Making request to Arxiv through pygetpapers")
-        search = arxiv.Search(
-            query=query, max_results=cutoff_size, sort_by=arxiv.SortCriterion.Relevance
+        search = arxiv_wrapper.Search(
+            query=query, max_results=cutoff_size, sort_by=arxiv_wrapper.SortCriterion.Relevance
         )
 
-        metadata_dictionary = {}
         logging.info("Got request result from Arxiv through pygetpapers")
+        search_results = search.get()
+        metadata_dictionary = self._make_metadata_dict_from_arxiv_output(search_results)
 
-        self._make_dict_from_arxiv_output(metadata_dictionary, search)
         for paper in metadata_dictionary:
             self.download_tools._add_download_status_keys(paper, metadata_dictionary)
         if getpdf:
             self.download_pdf(metadata_dictionary)
         self.download_tools.handle_creation_of_csv_html_xml(
             makecsv, makehtml, makexml, metadata_dictionary, ARXIV_RESULT
         )
-        self.make_json_from_arxiv_dict(metadata_dictionary)
+        self.write_metadata_json_from_arxiv_dict(metadata_dictionary)
 
         return metadata_dictionary
 
-    def make_json_from_arxiv_dict(self, metadata_dictionary):
+    def write_metadata_json_from_arxiv_dict(self, metadata_dictionary):
         """Iterates through metadata_dictionary and makes json metadata file for papers
 
         :param metadata_dictionary: metadata dictionary for papers
@@ -110,8 +110,9 @@ def make_json_from_arxiv_dict(self, metadata_dictionary):
             self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary[result])
 
     @staticmethod
-    def _make_dict_from_arxiv_output(metadata_dictionary, search):
-        for result in search.get():
+    def _make_metadata_dict_from_arxiv_output(search_results):
+        metadata_dictionary = {}
+        for result in search_results:
             url_encoded_id_of_paper = str(result.entry_id).rsplit("/", 1)[-1]
 
             metadata_dictionary[url_encoded_id_of_paper] = {}
@@ -142,6 +143,7 @@ def _make_dict_from_arxiv_output(metadata_dictionary, search):
                 result.pdf_url)
             paper_dict[ENTRY_ID] = str(
                 result.entry_id)
+        return metadata_dictionary
 
     def download_pdf(self, metadata_dictionary):
         """Downloads pdfs for papers in metadata dictionary
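
For context, the class above drives the lukasschwab/arxiv.py wrapper. A minimal sketch of that API, assuming a 1.x release of the package (current releases expose results as a generator via Search.results(); the search.get() call in the diff matches the wrapper version pinned at the time):

import arxiv as arxiv_wrapper  # same alias the commit introduces

search = arxiv_wrapper.Search(
    query="all:electron",  # illustrative query
    max_results=5,
    sort_by=arxiv_wrapper.SortCriterion.Relevance,
)
for result in search.results():  # yields arxiv.Result objects
    print(result.entry_id, result.title)
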
2 changes: 1 addition & 1 deletion pygetpapers/repository/crossref.py
@@ -34,7 +34,7 @@
 
 
 class CrossRef(RepositoryInterface):
-    """CrossRef class which handles crossref repository"""
+    """CrossRef class which handles crossref repository. It uses habanero repository wrapper to make its query"""
 
     def __init__(self):
 
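
For context, a minimal sketch of the habanero wrapper the new docstring refers to; the query and the printed fields are illustrative, and the response follows the Crossref REST API layout:

from habanero import Crossref

crossref_api = Crossref()
response = crossref_api.works(query="open science", limit=3)
for item in response["message"]["items"]:
    print(item.get("DOI"), item.get("title"))
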
61 changes: 30 additions & 31 deletions pygetpapers/repository/europe_pmc.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import time
+from numpy import True_
 
 import pandas as pd
 from tqdm import tqdm
@@ -70,7 +71,7 @@ class EuropePmc(RepositoryInterface):
     def __init__(self):
         self.download_tools = DownloadTools(EUROPEPMC)
 
-    def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"):
+    def query(self, query, cutoff_size, synonym=True, cursor_mark="*"):
         """Queries eupmc for given query for given number(cutoff_size) papers
 
         :param query: query
@@ -83,58 +84,58 @@ def europepmc(self, query, cutoff_size, synonym=True, cursor_mark="*"):
         :rtype: list
         """
         cutoff_size = int(cutoff_size)
-        cursor_mark=cursor_mark
-        (
-            list_of_papers,
-            maximum_hits_per_page,
-            morepapers,
-            len_list_papers,
-        ) = self.create_parameters_for_paper_download()
+        maximum_hits_per_page=1000
+        morepapers=True
+        len_list_papers=0
+        counter=0
+        print(len_list_papers)
+        print(cutoff_size)
         while len_list_papers <= cutoff_size and morepapers is True:
-            retireved_metadata_dictionary = self.build_and_send_query(
+            retrieved_metadata_dictionary = self.build_and_send_query(
                 maximum_hits_per_page, cursor_mark, query, synonym
             )
-            if retireved_metadata_dictionary:
+            if retrieved_metadata_dictionary:
                 counter += 1
-                totalhits = retireved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT]
+                totalhits = retrieved_metadata_dictionary[RESPONSE_WRAPPER][HITCOUNT]
                 if counter == 1:
                     logging.info("Total Hits are %s", totalhits)
                 if int(totalhits) == 0:
                     logging.warning("Could not find more papers")
                     break
-                list_of_papers,morepapers = self._add_papers_to_list_of_papers(list_of_papers,retireved_metadata_dictionary)
-                len_list_papers+=len(list_of_papers)
+                list_of_paper_metadata,morepapers = self._metadata_dictionary_to_list_of_dictionaries_for_each_paper(retrieved_metadata_dictionary)
+                len_list_papers+=len(list_of_paper_metadata)
                 morepapers,cursor_mark = self.add_cursor_mark_if_exists(
-                    retireved_metadata_dictionary
+                    retrieved_metadata_dictionary
                 )
-        list_of_papers = self.remove_extra_papers_from_list(cutoff_size, list_of_papers)
-        dictionary_with_papers = self._make_metadata_dictionary_from_list_of_papers(list_of_papers)
+        list_of_paper_metadata = self.remove_extra_papers_from_list(cutoff_size, list_of_paper_metadata)
+        dictionary_with_papers = self._make_dictionary_from_list_of_papers(list_of_paper_metadata)
         # We use this dictionary_with_papers as a sub dictionary for the metadata_dictionary
         metadata_dictionary={CURSOR_MARK:cursor_mark,"papers":dictionary_with_papers}
         return metadata_dictionary
 
-    def remove_extra_papers_from_list(self, cutoff_size, list_of_papers):
-        if len(list_of_papers) > cutoff_size:
-            list_of_papers = list_of_papers[0:cutoff_size]
-        return list_of_papers
+    def remove_extra_papers_from_list(self, cutoff_size, list_of_paper_metadata):
+        if len(list_of_paper_metadata) > cutoff_size:
+            list_of_paper_metadata = list_of_paper_metadata[0:cutoff_size]
+        return list_of_paper_metadata
 
-    def _add_papers_to_list_of_papers(self, list_of_papers,retireved_metadata_dictionary):
+    def _metadata_dictionary_to_list_of_dictionaries_for_each_paper(self,retireved_metadata_dictionary):
+        list_of_paper_metadata=[]
         morepapers = True
         if RESULT in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST]:
             single_result = isinstance(
                 retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT],dict
             )
             papers = retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT]
             if single_result and PMCID in papers:
-                list_of_papers.append(papers)
+                list_of_paper_metadata.append(papers)
             else:
                 for paper in retireved_metadata_dictionary[RESPONSE_WRAPPER][RESULT_LIST][RESULT]:
                     if PMCID in paper:
-                        list_of_papers.append(paper)
+                        list_of_paper_metadata.append(paper)
                     else:
                         morepapers = False
                         logging.warning("Could not find more papers")
-        return list_of_papers,morepapers
+        return list_of_paper_metadata,morepapers
 
 
     def add_cursor_mark_if_exists(self, retireved_metadata_dictionary):
@@ -167,12 +168,12 @@ def create_parameters_for_paper_download():
         :rtype: [type]
         """
 
-        list_of_papers = []
+        list_of_paper_metadata = []
         morepapers = True
         number_of_papers_there = 0
         maximum_hits_per_page = 1000
         return (
-            list_of_papers,
+            list_of_paper_metadata,
             maximum_hits_per_page,
             morepapers,
             number_of_papers_there,
@@ -261,7 +262,7 @@ def run_eupmc_query_and_get_metadata(
             cursor_mark= (update[CURSOR_MARK])
         else:
             cursor_mark = "*"
-        metadata_dictionary = self.europepmc(
+        metadata_dictionary = self.query(
             query, cutoff_size, cursor_mark=cursor_mark, synonym=synonym
         )
         self.make_metadata_json(
@@ -573,8 +574,6 @@ def add_fields_to_resultant_dict(
             else:
                 logging.warning("Title not found for paper %s", paper_number)
 
-
-
     def make_metadata_json(self, resultant_dict, update=False):
         if update:
             resultant_dict["papers"].update(update["papers"])
@@ -584,9 +583,9 @@ def make_metadata_json(self, resultant_dict, update=False):
         self.download_tools.dumps_json_to_given_path(jsonurl, resultant_dict)
         return resultant_dict
 
-    def _make_metadata_dictionary_from_list_of_papers(self, list_of_papers):
+    def _make_dictionary_from_list_of_papers(self, list_of_paper_metadata):
         resultant_dict = {}
-        for paper_number, paper in tqdm(enumerate(list_of_papers)):
+        for paper_number, paper in tqdm(enumerate(list_of_paper_metadata)):
             paper_number += 1
             identifier_for_paper = paper[PMCID]
             resultant_dict = self.download_tools._make_initial_columns_for_paper_dict(
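
For context, the renamed query() method implements Europe PMC's cursor-mark pagination. A minimal standalone sketch of that pattern against the REST service configured in config.ini (the query, pageSize, and 50-record cutoff are illustrative):

import requests

EPMC_SEARCH = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
cursor_mark = "*"
papers = []
while len(papers) < 50:  # stand-in for cutoff_size
    page = requests.get(EPMC_SEARCH, params={
        "query": "climate change",
        "format": "json",
        "pageSize": 25,
        "cursorMark": cursor_mark,
    }).json()
    results = page.get("resultList", {}).get("result", [])
    if not results:
        break  # could not find more papers
    papers.extend(results)
    next_cursor = page.get("nextCursorMark")
    if not next_cursor or next_cursor == cursor_mark:
        break  # last page returns the same cursor
    cursor_mark = next_cursor
print(len(papers), "records fetched")
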
4 changes: 4 additions & 0 deletions pygetpapers/repositoryinterface.py
@@ -18,6 +18,10 @@
 RXIV = "rxiv"
 class RepositoryInterface(ABC):
 
+    def __init__(self) -> None:
+        super().__init__()
+        self.metadata_dictionary=dict()
+
     @abstractmethod
     def noexecute(self, query_namespace):
         """Takes in the query_namespace object as the parameter and runs the query search for given search parameters but only prints the output and not write to disk.
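
For context, a minimal sketch of how the new shared __init__ interacts with a concrete repository class; DummyRepo is hypothetical, and the real interface declares more abstract methods than the one shown in this diff:

from abc import ABC, abstractmethod

class RepositoryInterface(ABC):
    def __init__(self) -> None:
        super().__init__()
        self.metadata_dictionary = dict()  # shared store for every repository

    @abstractmethod
    def noexecute(self, query_namespace):
        """Run the query but only print the output; never write to disk."""

class DummyRepo(RepositoryInterface):
    def noexecute(self, query_namespace):
        print("would search for:", query_namespace, "| store:", self.metadata_dictionary)

DummyRepo().noexecute({"query": "test"})
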
