Commit 24772ed: fixed eupmc
ayush4921 committed Mar 8, 2022
1 parent fce5f42 · commit 24772ed
Showing 16 changed files with 445 additions and 572 deletions.
Binary file modified pygetpapers/__pycache__/download_tools.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/pygetpapers.cpython-37.pyc
Binary file modified pygetpapers/__pycache__/repositoryinterface.cpython-37.pyc
95 changes: 55 additions & 40 deletions pygetpapers/download_tools.py
@@ -7,6 +7,7 @@
 import ntpath
 import os
 import time
+from urllib import request
 import xml.etree.ElementTree as ET
 import zipfile
 from time import gmtime, strftime
@@ -117,7 +118,7 @@ def setup_config_file(self,config_ini):
         config.read_string(config_file)
         return config

-    def postquery(self, headers, payload):
+    def gets_result_dict_for_query(self, headers, data):
         """Queries query_url provided in configuration file for the given headers and payload and returns result in the form of a python dictionary

         :param headers: headers given to the request
@@ -128,18 +129,31 @@ def gets_result_dict_for_query(self, headers, data):
         :rtype: dictionary
         """
         logging.debug("*/RESTful request for fulltext.xml (D)*/")
-        start = time.time()
-        request_handler = requests.post(
-            self.query_url, data=payload, headers=headers)
-        stop = time.time()
-        logging.debug("*/Got the Query Result */")
-        logging.debug("Time elapsed: %s", (stop - start))
+        request_handler = self.post_query(
+            self.query_url, data=data, headers=headers)
         parser = etree.XMLParser(recover=True)
         e= etree.fromstring(request_handler.content, parser=parser)
         xmlstr = etree.tostring(e, encoding='utf8', method='xml')
         dict_to_return = xmltodict.parse(xmlstr)
         return dict_to_return

+    def post_query(self,url, data=None, headers=None):
+        """Queries url
+
+        :param headers: headers given to the request
+        :type headers: dict
+        :param payload: payload given to the request
+        :type payload: dict
+        :return: result in the form of a python dictionary
+        :rtype: dictionary
+        """
+        start = time.time()
+        request_handler = requests.post(url,data=data, headers=headers)
+        stop = time.time()
+        logging.debug("*/Got the Query Result */")
+        logging.debug("Time elapsed: %s", (stop - start))
+        return request_handler
+
     @staticmethod
     def check_or_make_directory(directory_url):
         """Makes directory if doesn't already exist
@@ -241,8 +255,8 @@ def _eupmc_clean_dict_for_csv(paperdict):
         return dict_to_write

     @staticmethod
-    def _make_dataframe_for_paper_dict(result, return_dict):
-        dict_for_df = {k: [v] for k, v in return_dict[result].items()}
+    def _make_dataframe_for_paper_dict(result, metadata_dictionary):
+        dict_for_df = {k: [v] for k, v in metadata_dictionary[result].items()}
         df_for_paper = pd.DataFrame(dict_for_df)
         return df_for_paper

@@ -445,6 +459,7 @@ def check_if_content_is_zip(self, request_handler):
         :return: if zip file exits
         :rtype: bool
         """
+        file_exits=False
         for chunk in request_handler.iter_content(chunk_size=128):
             if len(chunk) > 0:
                 file_exits = True
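Initialising file_exits to False fixes a latent UnboundLocalError: when the response body was empty, the loop never ran and the function referenced a name that had never been assigned. A sketch of the fixed check; the tail of the function sits outside the hunk, so the return is an assumption:

def check_if_content_is_zip(request_handler):
    # Default to False so an empty body no longer leaves the name unassigned
    file_exits = False
    for chunk in request_handler.iter_content(chunk_size=128):
        if len(chunk) > 0:
            file_exits = True
    return file_exits  # assumed; the real function tail is not shown in the diff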
@@ -491,29 +506,29 @@ def _add_download_status_keys(key_for_dict, resultant_dict):
         resultant_dict[key_for_dict][CSVMADE] = False
         resultant_dict[key_for_dict][HTMLMADE] = False

-    def make_csv_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_csv_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """
         Writes csv content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
         logging.info("Making csv files for metadata at %s", os.getcwd())
-        df = self._get_dataframe_without_additional_pygetpapers_attributes(return_dict)
+        df = self._get_dataframe_without_additional_pygetpapers_attributes(metadata_dictionary)
         self.write_or_append_to_csv(df, name_main_result_file)
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,makecsv=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,makecsv=True)

-    def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=False,makexml=False,makehtml=False):
-        """Write csv, html or html content for papers in return_dict
+    def _make_csv_xml_or_html(self,name_result_file_for_paper,metadata_dictionary,makecsv=False,makexml=False,makehtml=False):
+        """Write csv, html or html content for papers in metadata_dictionary

         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
-        :param return_dict: Dictionary containing papers
-        :type return_dict: dict
+        :param metadata_dictionary: Dictionary containing papers
+        :type metadata_dictionary: dict
         :param makecsv: whether to get csv
         :type makecsv: bool
         :param makehtml: whether to get html
@@ -522,7 +537,7 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
         :type makexml: bool
         """
         paper = 0
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         for result in tqdm(dict_to_use):
             paper += 1
             result_encoded = self.url_encode_id(result)
@@ -533,11 +548,11 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
                 result, dict_to_use)
             if makecsv:
                 self.write_or_append_to_csv(df_for_paper, url)
-                return_dict[result][CSVMADE] = True
+                metadata_dictionary[result][CSVMADE] = True
                 logging.debug("Wrote csv files for paper %s", paper)
             if makehtml:
                 self.make_html_from_dataframe(df_for_paper, url)
-                return_dict[result][HTMLMADE] = True
+                metadata_dictionary[result][HTMLMADE] = True
                 logging.debug("Wrote html files for paper %s", paper)
             if makexml:
                 total_xml_of_paper = dict2xml(
@@ -549,49 +564,49 @@ def _make_csv_xml_or_html(self,name_result_file_for_paper,return_dict,makecsv=Fa
                     file_handler.write(total_xml_of_paper)
                 logging.debug("Wrote xml files for paper %s", paper)

-    def make_html_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_html_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """Writes html content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
         logging.info("Making html files for metadata at %s", os.getcwd())
         htmlurl = os.path.join(os.getcwd(), name_main_result_file)
-        df = self._get_dataframe_without_additional_pygetpapers_attributes(return_dict)
+        df = self._get_dataframe_without_additional_pygetpapers_attributes(metadata_dictionary)
         self.make_html_from_dataframe(df, htmlurl)
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,makehtml=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,makehtml=True)


-    def _get_dataframe_without_additional_pygetpapers_attributes(self, return_dict):
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+    def _get_dataframe_without_additional_pygetpapers_attributes(self, metadata_dictionary):
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         df = pd.DataFrame.from_dict(dict_to_use)
         return df

-    def make_xml_for_dict(self, return_dict, name_main_result_file, name_result_file_for_paper):
+    def make_xml_for_dict(self, metadata_dictionary, name_main_result_file, name_result_file_for_paper):
         """Writes xml content for the given dictionary to disk

-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name_main_result_file: name of the main result file (eg. eupmc-results.xml)
         :type name_main_result_file: string
         :param name_result_file_for_paper: name of the result file for a paper
         :type name_result_file_for_paper: string
         """
-        dict_to_use = self.removing_added_attributes_from_dictionary(return_dict)
+        dict_to_use = self.removing_added_attributes_from_dictionary(metadata_dictionary)
         total_xml = dict2xml(dict_to_use, wrap="root", indent="    ")
         logging.info("Making xml files for metadata at %s", os.getcwd())
         xmlurl = os.path.join(os.getcwd(), name_main_result_file)
         with open(xmlurl, "w", encoding="utf-8") as file_handler:
             file_handler.write(total_xml)
         paper = 0
-        self._make_csv_xml_or_html(name_result_file_for_paper,return_dict,paper,makexml=True)
+        self._make_csv_xml_or_html(name_result_file_for_paper,metadata_dictionary,paper,makexml=True)

     def handle_creation_of_csv_html_xml(
-        self, makecsv, makehtml, makexml, return_dict, name
+        self, makecsv, makehtml, makexml, metadata_dictionary, name
     ):
         """Writes csv, html, xml for given conditions
@@ -601,20 +616,20 @@ def handle_creation_of_csv_html_xml(
         :type makehtml: bool
         :param makexml: whether to get xml
         :type makexml: bool
-        :param return_dict: dictionary to write the content for
-        :type return_dict: dict
+        :param metadata_dictionary: dictionary to write the content for
+        :type metadata_dictionary: dict
         :param name: name of the file to save
         :type name: string
         """

         if makecsv:
             self.make_csv_for_dict(
-                return_dict, f"{name}s.csv", f"{name}.csv")
+                metadata_dictionary, f"{name}s.csv", f"{name}.csv")
         if makehtml:
             self.make_html_for_dict(
-                return_dict, f"{name}s.html", f"{name}.html")
+                metadata_dictionary, f"{name}s.html", f"{name}.html")
         if makexml:
-            self.make_xml_for_dict(return_dict, f"{name}s.xml", f"{name}.xml")
+            self.make_xml_for_dict(metadata_dictionary, f"{name}s.xml", f"{name}.xml")

     @staticmethod
     def url_encode_id(doi_of_paper):
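The remaining download_tools.py changes thread the return_dict to metadata_dictionary rename through the csv, html and xml writers without changing behaviour. A hedged usage sketch of the fan-out entry point; the constructor argument and the per-paper dictionary shape are assumptions modelled on the diff:

from pygetpapers.download_tools import DownloadTools

# One entry per paper id; flat metadata plus the status keys seeded by
# _add_download_status_keys (key names assumed from the diff's constants)
metadata_dictionary = {
    "PMC8625345": {"title": "Example paper", "csvmade": False, "htmlmade": False},
}

download_tools = DownloadTools("europe_pmc")  # mirrors DownloadTools(ARXIV) above
# With name="europe_pmc-result" this writes europe_pmc-results.csv for the whole
# set and europe_pmc-result.csv per paper, per the f"{name}s.csv" pattern
download_tools.handle_creation_of_csv_html_xml(
    makecsv=True, makehtml=False, makexml=False,
    metadata_dictionary=metadata_dictionary, name="europe_pmc-result",
)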
7 changes: 5 additions & 2 deletions pygetpapers/pygetpapers.py
@@ -115,13 +115,16 @@ def _add_date_to_query(self):
         else:
             self.query_namespace[DATE_OR_NUMBER_OF_PAPERS] = f'{self.query_namespace[STARTDATE]}/{self.query_namespace[ENDDATE]}'

-        if self.query_namespace[STARTDATE] and self.query_namespace[ENDDATE]:
+
+        if self.query_namespace[STARTDATE] and self.query_namespace[ENDDATE] and self.query_namespace[API]==EUROPEPMC:
             self.query_namespace[QUERY] = (
                 f'({self.query_namespace[QUERY]}) AND (FIRST_PDATE:[{self.query_namespace[STARTDATE]} TO {self.query_namespace[ENDDATE]}])'
             )
-        elif self.query_namespace[ENDDATE]:
+        elif self.query_namespace[ENDDATE] and self.query_namespace[API]==EUROPEPMC:
             self.query_namespace[QUERY] = f'({self.query_namespace[QUERY]}) AND (FIRST_PDATE:[TO {self.query_namespace[ENDDATE]}])'

+        if self.query_namespace[API]==BIORXIV or self.query_namespace[API]==MEDRXIV:
+            self.query_namespace[QUERY] = self.query_namespace[DATE_OR_NUMBER_OF_PAPERS]
     def add_terms_from_file(self):
         """Builds query from terms mentioned in a text file described in the argparse namespace object. See (https://pygetpapers.readthedocs.io/en/latest/index.html?highlight=terms#querying-using-a-term-list)
         Edits the namespace object's query flag.
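These new API guards matter because the FIRST_PDATE range filter is Europe PMC query syntax, while the biorxiv and medrxiv endpoints take the bare date interval in place of a query. A rough illustration of what each branch now produces; the constant values are assumed stand-ins for the module's repository-name strings:

EUROPEPMC, BIORXIV, MEDRXIV = "europe_pmc", "biorxiv", "medrxiv"  # assumed values

def build_query(query, startdate, enddate, api):
    # Europe PMC understands FIRST_PDATE range filters inside the query itself
    if startdate and enddate and api == EUROPEPMC:
        return f"({query}) AND (FIRST_PDATE:[{startdate} TO {enddate}])"
    # biorxiv/medrxiv instead receive the interval as the whole query
    if api in (BIORXIV, MEDRXIV):
        return f"{startdate}/{enddate}"
    return query

print(build_query("terpene", "2021-01-01", "2021-12-31", EUROPEPMC))
# (terpene) AND (FIRST_PDATE:[2021-01-01 TO 2021-12-31])
print(build_query("terpene", "2021-01-01", "2021-12-31", BIORXIV))
# 2021-01-01/2021-12-31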
Binary file modified pygetpapers/repository/__pycache__/arxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/crossref.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/europe_pmc.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/rxiv.cpython-37.pyc
Binary file modified pygetpapers/repository/__pycache__/rxivist.cpython-37.pyc
73 changes: 49 additions & 24 deletions pygetpapers/repository/arxiv.py
@@ -56,44 +56,66 @@ def __init__(self):
         self.download_tools = DownloadTools(ARXIV)

     def arxiv(
-        self, query, size, getpdf=False, makecsv=False, makexml=False, makehtml=False
+        self, query, cutoff_size, getpdf=False, makecsv=False, makexml=False, makehtml=False
     ):
+        """Builds the arxiv searcher and writes the xml, pdf, csv and html
+
+        :param query: query given to arxiv
+        :type query: string
+        :param cutoff_size: number of papers to retrieve
+        :type cutoff_size: int
+        :param getpdf: whether to get pdf
+        :type getpdf: bool, optional
+        :param makecsv: whether to get csv
+        :type makecsv: bool
+        :param makehtml: whether to get html
+        :type makehtml: bool
+        :param makexml: whether to get xml
+        :type makexml: bool
+        :return: dictionary of results retrieved from arxiv
+        :rtype: dict
+        """
         logging.info("Making request to Arxiv through pygetpapers")
         search = arxiv.Search(
-            query=query, max_results=size, sort_by=arxiv.SortCriterion.Relevance
+            query=query, max_results=cutoff_size, sort_by=arxiv.SortCriterion.Relevance
         )

-        return_dict = {}
+        metadata_dictionary = {}
         logging.info("Got request result from Arxiv through pygetpapers")

-        self.make_dict_from_arxiv_output(return_dict, search)
-        for paper in return_dict:
-            self.download_tools._add_download_status_keys(paper, return_dict)
+        self._make_dict_from_arxiv_output(metadata_dictionary, search)
+        for paper in metadata_dictionary:
+            self.download_tools._add_download_status_keys(paper, metadata_dictionary)
         if getpdf:
-            self.download_pdf(return_dict)
+            self.download_pdf(metadata_dictionary)
         self.download_tools.handle_creation_of_csv_html_xml(
-            makecsv, makehtml, makexml, return_dict, ARXIV_RESULT
+            makecsv, makehtml, makexml, metadata_dictionary, ARXIV_RESULT
         )
-        self.make_json_from_arxiv_dict(return_dict)
+        self.make_json_from_arxiv_dict(metadata_dictionary)

-        return return_dict
+        return metadata_dictionary

-    def make_json_from_arxiv_dict(self, return_dict):
+    def make_json_from_arxiv_dict(self, metadata_dictionary):
+        """Iterates through metadata_dictionary and makes json metadata file for papers
+
+        :param metadata_dictionary: metadata dictionary for papers
+        :type metadata_dictionary: dict
+        """
         jsonurl = os.path.join(os.getcwd(), ARXIV_RESULTS_JSON)
-        self.download_tools.dumps_json_to_given_path(jsonurl, return_dict)
-        for result in tqdm(return_dict):
-            return_dict[result][JSONDOWNLOADED] = True
+        self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary)
+        for result in tqdm(metadata_dictionary):
+            metadata_dictionary[result][JSONDOWNLOADED] = True
             self.download_tools.check_or_make_directory(result)
             jsonurl = os.path.join(os.getcwd(), result, ARXIV_RESULT_JSON)
-            self.download_tools.dumps_json_to_given_path(jsonurl, return_dict[result])
+            self.download_tools.dumps_json_to_given_path(jsonurl, metadata_dictionary[result])

     @staticmethod
-    def make_dict_from_arxiv_output(return_dict, search):
+    def _make_dict_from_arxiv_output(metadata_dictionary, search):
         for result in search.get():
             url_encoded_id_of_paper = str(result.entry_id).rsplit("/", 1)[-1]

-            return_dict[url_encoded_id_of_paper] = {}
-            paper_dict = return_dict[url_encoded_id_of_paper]
+            metadata_dictionary[url_encoded_id_of_paper] = {}
+            paper_dict = metadata_dictionary[url_encoded_id_of_paper]
             paper_dict[DATE_UPDATED] = str(
                 result.updated)
             paper_dict[DATE_PUBLISHED] = str(
@@ -121,22 +143,25 @@ def make_dict_from_arxiv_output(return_dict, search):
             paper_dict[ENTRY_ID] = str(
                 result.entry_id)

-    def download_pdf(self, return_dict):
-
+    def download_pdf(self, metadata_dictionary):
+        """Downloads pdfs for papers in metadata dictionary
+
+        :param metadata_dictionary: metadata dictionary for papers
+        :type metadata_dictionary: dict
+        """
         logging.info("Downloading Pdfs for papers")
-        for result in tqdm(return_dict):
+        for result in tqdm(metadata_dictionary):
             self.download_tools.check_or_make_directory(
                 os.path.join(os.getcwd(), result)
             )
             pdf_url = os.path.join(os.getcwd(), result, FULLTEXT_PDF)
             self.download_tools.queries_the_url_and_writes_response_to_destination(
-                return_dict[result][PDF_URL], pdf_url
+                metadata_dictionary[result][PDF_URL], pdf_url
             )
-            return_dict[result][PDFDOWNLOADED] = True
+            metadata_dictionary[result][PDFDOWNLOADED] = True

     @staticmethod
     def noexecute(query_namespace):
-
         logging.info("Arxiv api working for the query %s", query_namespace["query"])

     @staticmethod
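For orientation, a minimal sketch of the renamed arxiv flow: run a search capped at cutoff_size, then flatten each result into a per-paper metadata dictionary keyed by the URL-encoded entry id. The dictionary keys below are lower-case stand-ins for the module's constants, and search.get() matches the arxiv package version this diff targets (newer releases call it results()):

import arxiv  # the PyPI "arxiv" package that pygetpapers wraps

search = arxiv.Search(
    query="quantum computing",
    max_results=5,  # the renamed cutoff_size parameter
    sort_by=arxiv.SortCriterion.Relevance,
)

metadata_dictionary = {}
for result in search.get():
    # Key each paper by the tail of its entry id, as the diff does
    paper_id = str(result.entry_id).rsplit("/", 1)[-1]
    metadata_dictionary[paper_id] = {
        "date_updated": str(result.updated),
        "date_published": str(result.published),
        "pdf_url": result.pdf_url,
        "entry_id": str(result.entry_id),
    }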
