Skip to content

Commit

Permalink
added direct answer
Browse files Browse the repository at this point in the history
  • Loading branch information
JKlueber committed Mar 23, 2024
1 parent 2cf94c7 commit 615d0f4
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions archive_query_log/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,18 @@ class Snippet(SnippetId):
text: str | None = Text()


class DirectAnswerId(InnerDocument):
    """Inner document holding the ``id`` and ``rank`` of a direct answer."""

    # Stored as a keyword field.
    id: str = Keyword()
    # Stored as an integer field.
    rank: int = Integer()


class DirectAnswer(DirectAnswerId):
    """Direct answer: the ``DirectAnswerId`` fields plus content and boxes.

    ``content`` is annotated as required; the three ``*_box`` fields are
    optional.
    """

    content: str = Text()
    # NOTE(review): ``big_box`` is mapped as Keyword while the other two
    # boxes are mapped as Text -- confirm this difference is intended.
    big_box: str | None = Keyword()
    small_box: str | None = Text()
    right_box: str | None = Text()


class Serp(BaseDocument):
archive: InnerArchive = Object(InnerArchive)
provider: InnerProvider = Object(InnerProvider)
Expand All @@ -208,6 +220,8 @@ class Serp(BaseDocument):
warc_query_parser: InnerParser | None = Object(InnerParser)
warc_snippets: list[SnippetId] | None = Nested(SnippetId)
warc_snippets_parser: InnerParser | None = Object(InnerParser)
# Direct answers are referenced through their own ID document type
# (DirectAnswerId) rather than reusing the snippet reference type.
warc_direct_answer: list[DirectAnswerId] | None = Nested(DirectAnswerId)
warc_direct_answer_parser: InnerParser | None = Object(InnerParser)

# rendered_warc_location: WarcLocation | None = Object(WarcLocation)
# rendered_warc_downloader: InnerDownloader | None = (
Expand Down Expand Up @@ -437,6 +451,35 @@ class Index:
}


# Allowed values for ``WarcDirectAnswerParser.parser_type``.
WarcDirectAnswerParserType = Literal["xpath"]


class WarcDirectAnswerParser(BaseDocument):
    """Configuration document for a WARC direct answer parser.

    Instances are stored in the ``aql_warc_direct_answer_parsers`` index.
    """

    provider: InnerProviderId | None = Object(InnerProviderId)
    url_pattern_regex: str | None = Keyword()
    priority: float | None = RankFeature(positive_score_impact=True)
    parser_type: WarcDirectAnswerParserType = Keyword()
    xpath: str | None = Keyword()
    # NOTE(review): the URL/title/text XPath fields look like they were
    # carried over from a snippet-style parser -- confirm that direct
    # answers actually need all three.
    url_xpath: str | None = Keyword()
    title_xpath: str | None = Keyword()
    text_xpath: str | None = Keyword()

    @cached_property
    def url_pattern(self) -> Pattern | None:
        """Return ``pattern(url_pattern_regex)``, cached per instance.

        Raises:
            ValueError: If ``url_pattern_regex`` is ``None``.
        """
        regex = self.url_pattern_regex
        if regex is None:
            raise ValueError("No URL pattern regex.")
        return pattern(regex)

    class Index:
        """Index name and settings for this document type."""

        name = "aql_warc_direct_answer_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


WarcMainContentParserType = Literal[
"resiliparse",
]
Expand Down

0 comments on commit 615d0f4

Please sign in to comment.