Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docstring #290

Merged
merged 7 commits into from
Feb 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/sherpa_ai/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from sherpa_ai.agents.planner import Planner
from sherpa_ai.agents.qa_agent import QAAgent
from sherpa_ai.agents.user import UserAgent
from sherpa_ai.agents.mathematician import Mathematician


__all__ = [
"AgentPool",
Expand All @@ -15,4 +17,5 @@
"UserAgent",
"Critic",
"QAAgent",
"Mathematician",
]
5 changes: 4 additions & 1 deletion src/sherpa_ai/output_parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from sherpa_ai.output_parsers.base import BaseOutputParser
from sherpa_ai.output_parsers.citation_validation import CitationValidation
from sherpa_ai.output_parsers.link_parse import LinkParser
from sherpa_ai.output_parsers.md_to_slack_parse import MDToSlackParse
from sherpa_ai.output_parsers.number_validation import NumberValidation

__all__ = ["BaseOutputParser", "LinkParser", "MDToSlackParse"]

__all__ = ["BaseOutputParser", "LinkParser", "MDToSlackParse", "CitationValidation", "NumberValidation"]
37 changes: 36 additions & 1 deletion src/sherpa_ai/output_parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,49 @@


class BaseOutputParser(ABC):
"""
An abstract base class for output parsers.

All concrete output parser classes should inherit from this base class
and implement the abstract method 'parse_output'.

Attributes:
- None

@abstractmethod
def parse_output(self, text: str) -> str:
pass

Methods:
- parse_output(text: str) -> str:
This abstract method must be implemented by subclasses to define
the logic for parsing the given text and returning the parsed output.

Example Usage:
```python
class MyOutputParser(BaseOutputParser):
def parse_output(self, text: str) -> str:
# Implement custom logic to parse the output from 'text'
# and return the parsed result.
pass
```
"""

class BaseOutputProcessor(ABC):
@abstractmethod
def process_output(self, text: str) -> Tuple[bool, str]:
"""
Parse the output from the given text.

This method should be implemented by concrete subclasses to define
the logic for parsing the output from the provided 'text' and returning
the parsed result.

Parameters:
- text (str): The raw text to be parsed.

Returns:
- str: The parsed output.
"""
pass

def __call__(self, text: str) -> Tuple[bool, str]:
Expand Down
137 changes: 121 additions & 16 deletions src/sherpa_ai/output_parsers/citation_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,62 @@


class CitationValidation(BaseOutputParser):
"""
A class for adding citations to generated text based on a list of resources.

This class inherits from the abstract class BaseOutputParser and provides
methods to add citations to each sentence in the generated text based on
reference texts and links provided in the 'resources' list.

Attributes:
- seq_thresh (float): Threshold for common longest subsequence / text. Default is 0.7.
- jaccard_thresh (float): Jaccard similarity threshold. Default is 0.7.
- token_overlap (float): Token overlap threshold. Default is 0.7.

Methods:
- calculate_token_overlap(sentence1, sentence2): Calculate token overlap between two sentences.
- jaccard_index(sentence1, sentence2): Calculate Jaccard similarity index between two sentences.
- longestCommonSubsequence(text1, text2): Calculate the length of the longest common subsequence between two texts.
- unfoldList(nestedList): Flatten a nested list of strings.
- split_paragraph_into_sentences(paragraph): Tokenize a paragraph into sentences.
- parse_output(generated, resources): Add citation to each sentence in the generated text from resources based on fact-checking model.

Example Usage:
```python
citation_parser = CitationValidation(seq_thresh=0.7, jaccard_thresh=0.7, token_overlap=0.7)
result = citation_parser.parse_output(generated_text, list_of_resources)
```
"""

def __init__(self, seq_thresh=0.7, jaccard_thresh=0.7, token_overlap=0.7):
# threshold
"""
Initialize the CitationValidation object.

Args:
- seq_thresh (float): Threshold for common longest subsequence / text. Default is 0.7.
- jaccard_thresh (float): Jaccard similarity threshold. Default is 0.7.
- token_overlap (float): Token overlap threshold. Default is 0.7.
"""
self.seq_thresh = seq_thresh # threshold for common longest subsequece / text
self.jaccard_thresh = jaccard_thresh
self.token_overlap = token_overlap

def calculate_token_overlap(self, sentence1, sentence2):
def calculate_token_overlap(self, sentence1, sentence2) -> tuple:
"""
Calculate the percentage of token overlap between two sentences.

Tokenizes the input sentences and calculates the percentage of token overlap
by finding the intersection of the token sets and dividing it by the length
of each sentence's token set.

Args:
- sentence1 (str): The first sentence for token overlap calculation.
- sentence2 (str): The second sentence for token overlap calculation.

Returns:
- tuple: A tuple containing two float values representing the percentage
of token overlap for sentence1 and sentence2, respectively.
"""
# Tokenize the sentences
tokens1 = word_tokenize(sentence1)
tokens2 = word_tokenize(sentence2)
Expand All @@ -32,7 +81,20 @@ def calculate_token_overlap(self, sentence1, sentence2):
overlap_percentage_2 = len(overlapping_tokens) / (len(tokens2))
return overlap_percentage, overlap_percentage_2

def jaccard_index(sself, sentence1, sentence2):
def jaccard_index(sself, sentence1, sentence2) -> float:
"""
Calculate the Jaccard index between two sentences.

The Jaccard index is a measure of similarity between two sets, defined as the
size of the intersection divided by the size of the union of the sets.

Args:
- sentence1 (str): The first sentence for Jaccard index calculation.
- sentence2 (str): The second sentence for Jaccard index calculation.

Returns:
- float: The Jaccard index representing the similarity between the two sentences.
"""
# Convert the sentences to sets of words
set1 = set(word_tokenize(sentence1))
set2 = set(word_tokenize(sentence2))
Expand All @@ -46,10 +108,20 @@ def jaccard_index(sself, sentence1, sentence2):
return jaccard_index

def longestCommonSubsequence(self, text1: str, text2: str) -> int:
# A subsequence of a string is a new string generated from the
# original string with some characters
# (can be none) deleted without changing the relative
# order of the remaining characters.
"""
Calculate the length of the longest common subsequence between two texts.

A subsequence of a string is a new string generated from the original
string with some characters (can be none) deleted without changing
the relative order of the remaining characters.

Args:
- text1 (str): The first text for calculating the longest common subsequence.
- text2 (str): The second text for calculating the longest common subsequence.

Returns:
- int: The length of the longest common subsequence between the two texts.
"""
dp = [[0 for i in range(len(text1) + 1)] for i in range(len(text2) + 1)]

for i in range(1, len(text2) + 1):
Expand All @@ -60,30 +132,63 @@ def longestCommonSubsequence(self, text1: str, text2: str) -> int:
dp[i][j] = max(diagnoal, dp[i - 1][j], dp[i][j - 1])
return dp[-1][-1]

def unfoldList(self, nestedList: list[list[str]]):
def unfoldList(self, nestedList: list[list[str]]) -> list[str]:
    """Flatten a nested list of strings into a single flat list.

    Empty strings are dropped; the relative order of the remaining
    strings is preserved.

    Args:
        nestedList (list[list[str]]): The nested list of strings to flatten.

    Returns:
        list[str]: All non-empty strings from the sublists, in order.
    """
    return [entry for group in nestedList for entry in group if len(entry) > 0]

def split_paragraph_into_sentences(self, paragraph):
def split_paragraph_into_sentences(self, paragraph: str) -> list[str]:
    """Split a paragraph into a list of sentences.

    Delegates to NLTK's ``sent_tokenize`` for the actual sentence
    boundary detection.

    Args:
        paragraph (str): The text to split into sentences.

    Returns:
        list[str]: The sentences of the paragraph, in order.
    """
    return sent_tokenize(paragraph)

# add citation to the generated text
def parse_output(self, generated: str, resources: list[dict]) -> ValidationResult:
"""
Add citation to each sentence in the generated text from resources based on fact checking model.
Add citation to each sentence in the generated text based on the fact-checking method.

Args:
generated (str): The generated content where we need to add citation/reference
resources (list[dict]): A list of dictionaries containing reference text and links.
Each dictionary in the list should have the format {"Document": str, "Source": str}.
activated (bool): control whether we need to add citation or just return the raw generated text.
by default it is activated.
- generated (str): The generated content where citations/references need to be added.
- resources (list[dict]): A list of dictionaries containing reference text and links.
Each dictionary in the list should have the format {"Document": str, "Source": str}.

Returns:
str: A formatted string combining the citation information from the 'resources' list.
- ValidationResult: An object containing the result of citation addition and feedback.
The ValidationResult has attributes 'is_valid' indicating success, 'result' containing
the formatted text with citations, and 'feedback' providing additional information.

Note:
- The 'resources' list should contain dictionaries with "Document" and "Source" keys.

Example:
```python
resources = [{"Document": "Some reference text.", "Source": "http://example.com/source1"}]
citation_parser = CitationValidation()
result = citation_parser.parse_output("Generated text.", resources)
```

"""

# resources type
Expand Down
54 changes: 54 additions & 0 deletions src/sherpa_ai/output_parsers/link_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,32 @@


class LinkParser(BaseOutputParser):
"""
A class for parsing and modifying links in text using specified patterns.

This class inherits from the abstract class BaseOutputParser and provides
methods to parse and modify links in the input text. It includes functionality
to replace links with symbols and symbols with links based on predefined patterns.

Attributes:
- links (list): A list to store unique links encountered during parsing.
- link_to_id (dict): A dictionary mapping links to their corresponding symbols.
- count (int): Counter for generating unique symbols for new links.
- output_counter (int): Counter for reindexing output.
- reindex_mapping (dict): A mapping of original document IDs to reindexed IDs.
- url_pattern (str): Regular expression pattern for identifying links in the input text.
- doc_id_pattern (str): Regular expression pattern for identifying document IDs in the input text.
- link_symbol (str): Format string for representing link symbols.

Methods:
- parse_output(text: str, tool_output: bool = False) -> str:
Parses and modifies links in the input text based on the specified patterns.
"""

def __init__(self):
"""
Initialize the LinkParser object.
"""
self.links = []
self.link_to_id = {}
self.count = 1
Expand All @@ -21,7 +46,27 @@ def __init__(self):
self.link_symbol = "[{id}]"

def parse_output(self, text: str, tool_output=False) -> str:
"""
Parses and modifies links in the input text based on the specified patterns.

Args:
- text (str): The input text containing links or symbols to be parsed.
- tool_output (bool): A flag indicating whether the input text is tool-generated. Default is False.

Returns:
- str: The modified text with links replaced by symbols or symbols replaced by links.
"""

def replace_with_symbol(match: re.Match):
"""
Replaces links with symbols in the input text.

Args:
- match (re.Match): A regular expression match object.

Returns:
- str: The modified text with links replaced by symbols.
"""
link = match.group(1)
# check if the link is valid
if not link.startswith("http"):
Expand All @@ -34,6 +79,15 @@ def replace_with_symbol(match: re.Match):
return "DocID:" + self.link_to_id[link] + "\n"

def replace_with_link(match: re.Match):
"""
Replaces symbols with links in the input text.

Args:
- match (re.Match): A regular expression match object.

Returns:
- str: The modified text with symbols replaced by links.
"""
logger.debug(match)
doc_id = int(match.group(1))
if doc_id <= 0 or doc_id > len(self.links):
Expand Down
35 changes: 34 additions & 1 deletion src/sherpa_ai/output_parsers/md_to_slack_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,42 @@


class MDToSlackParse(BaseOutputParser):
    """Post-processor that converts Markdown links to Slack's link format.

    Every occurrence of ``[text](url)`` in the input is rewritten as
    ``<url|text>``, the hyperlink syntax Slack messages understand.

    Attributes:
        pattern (str): Regex matching Markdown links; capture group 1 is
            the link text, capture group 2 is the URL.

    Example:
        ```python
        parser = MDToSlackParse()
        parser.parse_output("See [docs](http://example.com)!")
        # -> "See <http://example.com|docs>!"
        ```
    """

    def __init__(self) -> None:
        """Compile-ready Markdown-link pattern is stored as a raw string."""
        self.pattern = r"\[([^\]]+)\]\(([^)]+)\)"

    def parse_output(self, text: str) -> str:
        """Rewrite all Markdown links in the input as Slack links.

        Args:
            text (str): Input text that may contain Markdown-style links.

        Returns:
            str: The text with each ``[text](url)`` replaced by
            ``<url|text>``.
        """
        return re.sub(self.pattern, r"<\2|\1>", text)
Loading