add function for saving in csv and json format

ScrapeGraphAI · Feb 13, 2024 · ba2af0b · ba2af0b
1 parent 3a77023
commit ba2af0b
Show file tree

Hide file tree

Showing 12 changed files with 155 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -193,6 +193,10 @@ The output format is a dict and it's the following:
     }
 ```
 
+# Credits
+Thanks to: 
+- [nicolapiazzalunga](https://github.com/nicolapiazzalunga): for inspiring yosoai/convert_to_csv.py and yosoai/convert_to_json.py functions
+
 # Developed by
 
 <p align="center">

diff --git a/tests/result.csv b/tests/result.csv
@@ -0,0 +1,2 @@
+0,1,2
+1,2,3
diff --git a/tests/result.json b/tests/result.json
@@ -0,0 +1 @@
+{"trial": [1, 2, 3]}
diff --git a/tests/test_convert_to_csv.py b/tests/test_convert_to_csv.py
@@ -0,0 +1,18 @@
+"""Module for testing convert_to_csv, defined in yosoai/convert_to_csv.py"""
+import unittest
+from yosoai.convert_to_csv import convert_to_csv
+
+
+class TestConvertToCsvFunction(unittest.TestCase):
+    """ 
+    class for testing convert_to_csv, defined in yosoai/convert_to_csv.py
+    """
+
+    def test_get_json(self):
+        """
+        function for testing convert_to_csv, defined in yosoai/convert_to_csv.py
+        """
+        example = {"trial": [1, 2, 3]}
+        filename = "result"
+        path = "../YOSO-ai/tests"
+        convert_to_csv(example, filename, path)
diff --git a/tests/test_convert_to_json.py b/tests/test_convert_to_json.py
@@ -0,0 +1,20 @@
+""" 
+Module for testing convert_to_json inside the folder yosoai/convert_to_json.py
+"""
+import unittest
+from yosoai.convert_to_json import convert_to_json
+
+
+class TestConvertToCsvFunction(unittest.TestCase):
+    """ 
+    class for testing convert_to_json inside the folder yosoai/convert_to_json.py
+    """
+
+    def test_get_json(self):
+        """
+        function for testing convert_to_json inside the folder yosoai/convert_to_json.py
+        """
+        example = {"trial": [1, 2, 3]}
+        filename = "result"
+        path = "../YOSO-ai/tests"
+        convert_to_json(example, filename, path)
diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py
diff --git a/tests/unit_tests/test_yosoai.py b/tests/unit_tests/test_yosoai.py
diff --git a/yosoai/__init__.py b/yosoai/__init__.py
@@ -3,6 +3,6 @@
 """
 from .class_creator import create_class
 from .class_generator import Generator
-from .getter import  _get_function
-from .token_calculator import truncate_text_tokens
+from .getter import _get_function
 from .request import send_request
+from .convert_to_csv import convert_to_csv
diff --git a/yosoai/convert_to_csv.py b/yosoai/convert_to_csv.py
@@ -0,0 +1,36 @@
+"""
+Module that given a filename and a position saves the file in the csv format
+"""
+import os
+import pandas as pd
+
+
+def convert_to_csv(data: dict, filename: str, position: str):
+    """
+    Convert data to CSV format and save it to a file.
+
+    Args:
+        data (dict): Data to save.
+        filename (str): Name of the file to save without .csv extension.
+        position (str): Directory where the file should be saved.
+
+    Raises:
+        ValueError: If filename contains '.csv'.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If the program does not have permission to write to the directory.
+    """
+    if ".csv" in filename:
+        raise ValueError("The filename should not contain '.csv'")
+
+    try:
+        os.makedirs(position, exist_ok=True)
+        pd.DataFrame.from_dict(data, orient='index').to_csv(
+            os.path.join(position, f"{filename}.csv"), index=False)
+    except FileNotFoundError as fnfe:
+        raise FileNotFoundError(
+            f"The specified directory '{position}' does not exist.") from fnfe
+    except PermissionError as pe:
+        raise PermissionError(
+            f"You don't have permission to write to '{position}'.") from pe
+    except Exception as e:
+        raise e
diff --git a/yosoai/convert_to_json.py b/yosoai/convert_to_json.py
@@ -0,0 +1,36 @@
+"""
+Module that given a filename and a position saves the file in the json format
+"""
+import json
+import os
+
+
+def convert_to_json(data: dict, filename: str, position: str):
+    """
+    Convert data to JSON format and save it to a file.
+
+    Args:
+        data (dict): Data to save.
+        filename (str): Name of the file to save without .json extension.
+        position (str): Directory where the file should be saved.
+
+    Raises:
+        ValueError: If filename contains '.json'.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If the program does not have permission to write to the directory.
+    """
+    if ".json" in filename:
+        raise ValueError("The filename should not contain '.json'")
+
+    try:
+        os.makedirs(position, exist_ok=True)
+        with open(os.path.join(position, f"{filename}.json"), "w", encoding="utf-8") as f:
+            f.write(json.dumps(data))
+    except FileNotFoundError as fnfe:
+        raise FileNotFoundError(
+            f"The specified directory '{position}' does not exist.") from fnfe
+    except PermissionError as pe:
+        raise PermissionError(
+            f"You don't have permission to write to '{position}'.") from pe
+    except Exception as e:
+        raise e
diff --git a/yosoai/json_getter.py b/yosoai/json_getter.py
@@ -41,8 +41,8 @@ def get_json(key: str, link: str, model_name: str,
         prompt = PromptTemplate(
             template=("You are a website scraper and you want to extract information "
                       "in a schema like the example provided. Write a dictionary where "
-                      "the key is the section and the value is the type.\n{format_instructions}\n{query}\n"
-                      ". Example: {example}"),
+                      "the key is the section and the value is the type"
+                      ".\n{format_instructions}\n{query}\n. Example: {example}"),
             input_variables=["query"],
             partial_variables={
                 "format_instructions": parser.get_format_instructions(),

diff --git a/yosoai/token_calculator.py b/yosoai/token_calculator.py
@@ -1,27 +1,42 @@
-"""
-Module for calculating token truncation for text
-"""
+import tiktoken
 from typing import List
-from tiktoken import tokenizer
+
+models_tokens = {
+    "gpt-3.5-turbo-0125": 16385,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-1106": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "gpt-4-0125-preview": 128000,
+    "gpt-4-turbo-preview": 128000,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
+    "gpt-4": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0613": 32768,
+}
 
 
 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
     """
-    Truncates the input text into smaller chunks based on the model's token limit.
+    Split the text into a list of strings, each at most the model's token limit minus 500 tokens.
+
     Args:
-        text (str): The input text to be truncated.
-        model (str): The name of the language model.
-        encoding_name (str): The name of the encoding to be used.
+        text (str): The input text to be truncated into tokenizable elements.
+        model (str): The name of the language model to be used.
+        encoding_name (str): The name of the tiktoken encoding to be used (required; no default).
+
     Returns:
-        List[str]: A list of truncated text chunks.
+        List[str]: A list of tokenizable elements created from the input text.
     """
-    # Calculate the token limit for the given model and encoding
-    token_limit = tokenizer.token_limit(model, encoding_name)
-    # Truncate the text into smaller chunks based on the token limit
-    chunks = []
-    start = 0
-    while start < len(text):
-        chunk = text[start:start+token_limit]
-        chunks.append(chunk)
-        start += token_limit
-    return chunks
+
+    encoding = tiktoken.get_encoding(encoding_name)
+    max_tokens = models_tokens[model] - 500
+    encoded_text = encoding.encode(text)
+
+    chunks = [encoded_text[i:i + max_tokens]
+              for i in range(0, len(encoded_text), max_tokens)]
+
+    result = [encoding.decode(chunk) for chunk in chunks]
+
+    return result