From 615d0f41a5ff82fcbb799186167735314e86e9dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Kl=C3=BCber?=
Date: Sat, 23 Mar 2024 16:52:47 +0100
Subject: [PATCH] added direct answer

---
 archive_query_log/orm.py | 51 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/archive_query_log/orm.py b/archive_query_log/orm.py
index 9dacd3e..24afe24 100644
--- a/archive_query_log/orm.py
+++ b/archive_query_log/orm.py
@@ -190,6 +190,20 @@ class Snippet(SnippetId):
     text: str | None = Text()
 
 
+class DirectAnswerId(InnerDocument):
+    """Identifying fields of a direct answer: its ID and rank."""
+    id: str = Keyword()
+    rank: int = Integer()
+
+
+class DirectAnswer(DirectAnswerId):
+    """Direct answer with its content and optional box fields."""
+    content: str = Text()
+    big_box: str | None = Keyword()
+    small_box: str | None = Text()
+    right_box: str | None = Text()
+
+
 class Serp(BaseDocument):
     archive: InnerArchive = Object(InnerArchive)
     provider: InnerProvider = Object(InnerProvider)
@@ -208,6 +222,8 @@ class Serp(BaseDocument):
     warc_query_parser: InnerParser | None = Object(InnerParser)
     warc_snippets: list[SnippetId] | None = Nested(SnippetId)
     warc_snippets_parser: InnerParser | None = Object(InnerParser)
+    warc_direct_answer: list[DirectAnswerId] | None = Nested(DirectAnswerId)
+    warc_direct_answer_parser: InnerParser | None = Object(InnerParser)
 
     # rendered_warc_location: WarcLocation | None = Object(WarcLocation)
     # rendered_warc_downloader: InnerDownloader | None = (
@@ -437,6 +453,41 @@ class Index:
         }
 
 
+WarcDirectAnswerParserType = Literal[
+    "xpath",
+]
+
+
+class WarcDirectAnswerParser(BaseDocument):
+    """Configuration document for a WARC direct answer parser."""
+    provider: InnerProviderId | None = Object(InnerProviderId)
+    url_pattern_regex: str | None = Keyword()
+    priority: float | None = RankFeature(positive_score_impact=True)
+    parser_type: WarcDirectAnswerParserType = Keyword()
+    xpath: str | None = Keyword()
+    url_xpath: str | None = Keyword()
+    title_xpath: str | None = Keyword()
+    text_xpath: str | None = Keyword()
+
+    @cached_property
+    def url_pattern(self) -> Pattern | None:
+        """Build the URL pattern from ``url_pattern_regex``.
+
+        Raises:
+            ValueError: If ``url_pattern_regex`` is ``None``.
+        """
+        if self.url_pattern_regex is None:
+            raise ValueError("No URL pattern regex.")
+        return pattern(self.url_pattern_regex)
+
+    class Index:
+        name = "aql_warc_direct_answer_parsers"
+        settings = {
+            "number_of_shards": 1,
+            "number_of_replicas": 2,
+        }
+
+
 WarcMainContentParserType = Literal[
     "resiliparse",
 ]