From a1c53d30635719136bb54d11f06af5edfcf17b05 Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 17 Jan 2025 16:56:44 -0800 Subject: [PATCH] Documenting citation style (#817) --- paperqa/docs.py | 23 ++++------------------- paperqa/settings.py | 1 + paperqa/utils.py | 20 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/paperqa/docs.py b/paperqa/docs.py index ff16c204..5f26550e 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -42,6 +42,7 @@ from paperqa.settings import MaybeSettings, get_settings from paperqa.types import Doc, DocDetails, DocKey, PQASession, Text from paperqa.utils import ( + citation_to_docname, gather_with_concurrency, get_loop, maybe_is_html, @@ -306,23 +307,7 @@ async def aadd( # noqa: PLR0912 ): citation = f"Unknown, {os.path.basename(path)}, {datetime.now().year}" - if docname is None: - # get first name and year from citation - match = re.search(r"([A-Z][a-z]+)", citation) - if match is not None: - author = match.group(1) - else: - # panicking - no word?? - raise ValueError( - f"Could not parse docname from citation {citation}. " - "Consider just passing key explicitly - e.g. docs.py " - "(path, citation, key='mykey')" - ) - year = "" - match = re.search(r"(\d{4})", citation) - if match is not None: - year = match.group(1) - docname = f"{author}{year}" + docname = citation_to_docname(citation) if docname is None else docname docname = self._get_unique_name(docname) doc = Doc(docname=docname, citation=citation, dockey=dockey) @@ -801,8 +786,8 @@ async def aquery( # noqa: PLR0912 answer_text = answer_result.text session.add_tokens(answer_result) # it still happens - if prompt_config.EXAMPLE_CITATION in answer_text: - answer_text = answer_text.replace(prompt_config.EXAMPLE_CITATION, "") + if (ex_citation := prompt_config.EXAMPLE_CITATION) in answer_text: + answer_text = answer_text.replace(ex_citation, "") for c in filtered_contexts: name = c.text.name citation = c.text.doc.formatted_citation diff --git a/paperqa/settings.py b/paperqa/settings.py index 4939e5ff..b90aa761 100644 --- a/paperqa/settings.py +++ b/paperqa/settings.py @@ -259,6 +259,7 @@ def get_formatted_variables(s: str) -> set[str]: class PromptSettings(BaseModel): model_config = ConfigDict(extra="forbid", validate_assignment=True) + # MLA parenthetical in-text citation, SEE: https://nwtc.libguides.com/citations/MLA#s-lg-box-707489 EXAMPLE_CITATION: ClassVar[str] = "(Example2012Example pages 3-4)" summary: str = summary_prompt diff --git a/paperqa/utils.py b/paperqa/utils.py index 06f80408..be2491ab 100644 --- a/paperqa/utils.py +++ b/paperqa/utils.py @@ -544,3 +544,23 @@ def logging_filters( log_with_filter = logging.getLogger(logger_name) for log_filter_to_remove in log_filters_to_remove: log_with_filter.removeFilter(log_filter_to_remove) + + +def citation_to_docname(citation: str) -> str: + """Create a docname that follows MLA parenthetical in-text citation.""" + # get first name and year from citation + match = re.search(r"([A-Z][a-z]+)", citation) + if match is not None: + author = match.group(1) + else: + # panicking - no word?? + raise ValueError( + f"Could not parse docname from citation {citation}. " + "Consider just passing key explicitly - e.g. docs.py " + "(path, citation, key='mykey')" + ) + year = "" + match = re.search(r"(\d{4})", citation) + if match is not None: + year = match.group(1) + return f"{author}{year}"