Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docstring #290

Merged
merged 7 commits into from
Feb 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/sherpa_ai/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from sherpa_ai.agents.planner import Planner
from sherpa_ai.agents.qa_agent import QAAgent
from sherpa_ai.agents.user import UserAgent
from sherpa_ai.agents.mathematician import Mathematician


__all__ = [
"AgentPool",
Expand All @@ -15,4 +17,5 @@
"UserAgent",
"Critic",
"QAAgent",
"Mathematician",
]
5 changes: 4 additions & 1 deletion src/sherpa_ai/output_parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from sherpa_ai.output_parsers.base import BaseOutputParser
from sherpa_ai.output_parsers.citation_validation import CitationValidation
from sherpa_ai.output_parsers.link_parse import LinkParser
from sherpa_ai.output_parsers.md_to_slack_parse import MDToSlackParse
from sherpa_ai.output_parsers.number_validation import NumberValidation

__all__ = ["BaseOutputParser", "LinkParser", "MDToSlackParse"]

__all__ = ["BaseOutputParser", "LinkParser", "MDToSlackParse", "CitationValidation", "NumberValidation"]
37 changes: 36 additions & 1 deletion src/sherpa_ai/output_parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,49 @@


class BaseOutputParser(ABC):
"""
An abstract base class for output parsers.

All concrete output parser classes should inherit from this base class
and implement the abstract method 'parse_output'.

Attributes:
- None

@abstractmethod
def parse_output(self, text: str) -> str:
pass

Methods:
- parse_output(text: str) -> str:
This abstract method must be implemented by subclasses to define
the logic for parsing the given text and returning the parsed output.

Example Usage:
```python
class MyOutputParser(BaseOutputParser):
def parse_output(self, text: str) -> str:
# Implement custom logic to parse the output from 'text'
# and return the parsed result.
pass
```
"""

class BaseOutputProcessor(ABC):
@abstractmethod
def process_output(self, text: str) -> Tuple[bool, str]:
"""
Parse the output from the given text.

This method should be implemented by concrete subclasses to define
the logic for parsing the output from the provided 'text' and returning
the parsed result.

Parameters:
- text (str): The raw text to be parsed.

Returns:
- str: The parsed output.
"""
pass

def __call__(self, text: str) -> Tuple[bool, str]:
Expand Down
137 changes: 121 additions & 16 deletions src/sherpa_ai/output_parsers/citation_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,62 @@


class CitationValidation(BaseOutputParser):
"""
A class for adding citations to generated text based on a list of resources.

This class inherits from the abstract class BaseOutputParser and provides
methods to add citations to each sentence in the generated text based on
reference texts and links provided in the 'resources' list.

Attributes:
- seq_thresh (float): Threshold for common longest subsequence / text. Default is 0.7.
- jaccard_thresh (float): Jaccard similarity threshold. Default is 0.7.
- token_overlap (float): Token overlap threshold. Default is 0.7.

Methods:
- calculate_token_overlap(sentence1, sentence2): Calculate token overlap between two sentences.
- jaccard_index(sentence1, sentence2): Calculate Jaccard similarity index between two sentences.
- longestCommonSubsequence(text1, text2): Calculate the length of the longest common subsequence between two texts.
- unfoldList(nestedList): Flatten a nested list of strings.
- split_paragraph_into_sentences(paragraph): Tokenize a paragraph into sentences.
- parse_output(generated, resources): Add citation to each sentence in the generated text from resources based on fact-checking model.

Example Usage:
```python
citation_parser = CitationValidation(seq_thresh=0.7, jaccard_thresh=0.7, token_overlap=0.7)
result = citation_parser.parse_output(generated_text, list_of_resources)
```
"""

def __init__(self, seq_thresh=0.7, jaccard_thresh=0.7, token_overlap=0.7):
# threshold
"""
Initialize the CitationValidation object.

Args:
- seq_thresh (float): Threshold for common longest subsequence / text. Default is 0.7.
- jaccard_thresh (float): Jaccard similarity threshold. Default is 0.7.
- token_overlap (float): Token overlap threshold. Default is 0.7.
"""
self.seq_thresh = seq_thresh # threshold for common longest subsequece / text
self.jaccard_thresh = jaccard_thresh
self.token_overlap = token_overlap

def calculate_token_overlap(self, sentence1, sentence2):
def calculate_token_overlap(self, sentence1, sentence2) -> tuple:
"""
Calculate the percentage of token overlap between two sentences.

Tokenizes the input sentences and calculates the percentage of token overlap
by finding the intersection of the token sets and dividing it by the length
of each sentence's token set.

Args:
- sentence1 (str): The first sentence for token overlap calculation.
- sentence2 (str): The second sentence for token overlap calculation.

Returns:
- tuple: A tuple containing two float values representing the percentage
of token overlap for sentence1 and sentence2, respectively.
"""
# Tokenize the sentences
tokens1 = word_tokenize(sentence1)
tokens2 = word_tokenize(sentence2)
Expand All @@ -32,7 +81,20 @@ def calculate_token_overlap(self, sentence1, sentence2):
overlap_percentage_2 = len(overlapping_tokens) / (len(tokens2))
return overlap_percentage, overlap_percentage_2

def jaccard_index(sself, sentence1, sentence2):
def jaccard_index(sself, sentence1, sentence2) -> float:
"""
Calculate the Jaccard index between two sentences.

The Jaccard index is a measure of similarity between two sets, defined as the
size of the intersection divided by the size of the union of the sets.

Args:
- sentence1 (str): The first sentence for Jaccard index calculation.
- sentence2 (str): The second sentence for Jaccard index calculation.

Returns:
- float: The Jaccard index representing the similarity between the two sentences.
"""
# Convert the sentences to sets of words
set1 = set(word_tokenize(sentence1))
set2 = set(word_tokenize(sentence2))
Expand All @@ -46,10 +108,20 @@ def jaccard_index(sself, sentence1, sentence2):
return jaccard_index

def longestCommonSubsequence(self, text1: str, text2: str) -> int:
# A subsequence of a string is a new string generated from the
# original string with some characters
# (can be none) deleted without changing the relative
# order of the remaining characters.
"""
Calculate the length of the longest common subsequence between two texts.

A subsequence of a string is a new string generated from the original
string with some characters (can be none) deleted without changing
the relative order of the remaining characters.

Args:
- text1 (str): The first text for calculating the longest common subsequence.
- text2 (str): The second text for calculating the longest common subsequence.

Returns:
- int: The length of the longest common subsequence between the two texts.
"""
dp = [[0 for i in range(len(text1) + 1)] for i in range(len(text2) + 1)]

for i in range(1, len(text2) + 1):
Expand All @@ -60,30 +132,63 @@ def longestCommonSubsequence(self, text1: str, text2: str) -> int:
dp[i][j] = max(diagnoal, dp[i - 1][j], dp[i][j - 1])
return dp[-1][-1]

def unfoldList(self, nestedList: list[list[str]]):
def unfoldList(self, nestedList: list[list[str]]) -> list[str]:
    """Flatten a nested list of strings into a single flat list.

    Empty strings are dropped; the relative order of the remaining
    strings is preserved.

    Args:
        nestedList (list[list[str]]): The nested list of strings to flatten.

    Returns:
        list[str]: All non-empty strings from the sublists, in order.
    """
    return [entry for group in nestedList for entry in group if len(entry) > 0]

def split_paragraph_into_sentences(self, paragraph):
def split_paragraph_into_sentences(self, paragraph: str) -> list[str]:
    """Split a paragraph into a list of sentences.

    Delegates to NLTK's ``sent_tokenize`` for the actual sentence
    boundary detection.

    Args:
        paragraph (str): The text to split into sentences.

    Returns:
        list[str]: The sentences of the paragraph, in order.
    """
    return sent_tokenize(paragraph)

# add citation to the generated text
def parse_output(self, generated: str, resources: list[dict]) -> ValidationResult:
"""
Add citation to each sentence in the generated text from resources based on fact checking model.
Add citation to each sentence in the generated text based on the fact-checking method.

Args:
generated (str): The generated content where we need to add citation/reference
resources (list[dict]): A list of dictionaries containing reference text and links.
Each dictionary in the list should have the format {"Document": str, "Source": str}.
activated (bool): control whether we need to add citation or just return the raw generated text.
by default it is activated.
- generated (str): The generated content where citations/references need to be added.
- resources (list[dict]): A list of dictionaries containing reference text and links.
Each dictionary in the list should have the format {"Document": str, "Source": str}.

Returns:
str: A formatted string combining the citation information from the 'resources' list.
- ValidationResult: An object containing the result of citation addition and feedback.
The ValidationResult has attributes 'is_valid' indicating success, 'result' containing
the formatted text with citations, and 'feedback' providing additional information.

Note:
- The 'resources' list should contain dictionaries with "Document" and "Source" keys.

Example:
```python
resources = [{"Document": "Some reference text.", "Source": "http://example.com/source1"}]
citation_parser = CitationValidation()
result = citation_parser.parse_output("Generated text.", resources)
```

"""

# resources type
Expand Down
54 changes: 54 additions & 0 deletions src/sherpa_ai/output_parsers/link_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,32 @@


class LinkParser(BaseOutputParser):
"""
A class for parsing and modifying links in text using specified patterns.

This class inherits from the abstract class BaseOutputParser and provides
methods to parse and modify links in the input text. It includes functionality
to replace links with symbols and symbols with links based on predefined patterns.

Attributes:
- links (list): A list to store unique links encountered during parsing.
- link_to_id (dict): A dictionary mapping links to their corresponding symbols.
- count (int): Counter for generating unique symbols for new links.
- output_counter (int): Counter for reindexing output.
- reindex_mapping (dict): A mapping of original document IDs to reindexed IDs.
- url_pattern (str): Regular expression pattern for identifying links in the input text.
- doc_id_pattern (str): Regular expression pattern for identifying document IDs in the input text.
- link_symbol (str): Format string for representing link symbols.

Methods:
- parse_output(text: str, tool_output: bool = False) -> str:
Parses and modifies links in the input text based on the specified patterns.
"""

def __init__(self):
"""
Initialize the LinkParser object.
"""
self.links = []
self.link_to_id = {}
self.count = 1
Expand All @@ -21,7 +46,27 @@ def __init__(self):
self.link_symbol = "[{id}]"

def parse_output(self, text: str, tool_output=False) -> str:
"""
Parses and modifies links in the input text based on the specified patterns.

Args:
- text (str): The input text containing links or symbols to be parsed.
- tool_output (bool): A flag indicating whether the input text is tool-generated. Default is False.

Returns:
- str: The modified text with links replaced by symbols or symbols replaced by links.
"""

def replace_with_symbol(match: re.Match):
"""
Replaces links with symbols in the input text.

Args:
- match (re.Match): A regular expression match object.

Returns:
- str: The modified text with links replaced by symbols.
"""
link = match.group(1)
# check if the link is valid
if not link.startswith("http"):
Expand All @@ -34,6 +79,15 @@ def replace_with_symbol(match: re.Match):
return "DocID:" + self.link_to_id[link] + "\n"

def replace_with_link(match: re.Match):
"""
Replaces symbols with links in the input text.

Args:
- match (re.Match): A regular expression match object.

Returns:
- str: The modified text with symbols replaced by links.
"""
logger.debug(match)
doc_id = int(match.group(1))
if doc_id <= 0 or doc_id > len(self.links):
Expand Down
35 changes: 34 additions & 1 deletion src/sherpa_ai/output_parsers/md_to_slack_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,42 @@


class MDToSlackParse(BaseOutputParser):
    """Post-processor that converts Markdown links to Slack's link format.

    Every occurrence of ``[text](url)`` in the input is rewritten as
    ``<url|text>``, the hyperlink syntax Slack messages understand.

    Attributes:
        pattern (str): Regex matching Markdown links; capture group 1 is
            the link text, capture group 2 is the URL.

    Example:
        ```python
        parser = MDToSlackParse()
        parser.parse_output("See [docs](http://example.com)!")
        # -> "See <http://example.com|docs>!"
        ```
    """

    def __init__(self) -> None:
        """Compile-ready Markdown-link pattern is stored as a raw string."""
        self.pattern = r"\[([^\]]+)\]\(([^)]+)\)"

    def parse_output(self, text: str) -> str:
        """Rewrite all Markdown links in the input as Slack links.

        Args:
            text (str): Input text that may contain Markdown-style links.

        Returns:
            str: The text with each ``[text](url)`` replaced by
            ``<url|text>``.
        """
        return re.sub(self.pattern, r"<\2|\1>", text)
Loading