From 50c7ed57d5387d7725226a13dde2bd651d5e7987 Mon Sep 17 00:00:00 2001
From: Saurav Panda
Date: Sun, 27 Oct 2024 16:23:25 -0700
Subject: [PATCH] refactor: remove ragify to reduce package load size

---
 examples/ragify_codebase/main.py          |  32 --
 kaizen/retriever/__init__.py              |   0
 kaizen/retriever/code_chunker.py          | 166 ---------
 kaizen/retriever/feedback_system.py       |  18 -
 kaizen/retriever/llama_index_retriever.py | 407 ----------
 kaizen/retriever/qdrant_vector_store.py   |  64 ----
 kaizen/retriever/tree_sitter_utils.py     | 131 -------
 kaizen/tests/retriever/test_chunker.py    | 101 ------
 pyproject.toml                            |   2 +-
 9 files changed, 1 insertion(+), 920 deletions(-)
 delete mode 100644 examples/ragify_codebase/main.py
 delete mode 100644 kaizen/retriever/__init__.py
 delete mode 100644 kaizen/retriever/code_chunker.py
 delete mode 100644 kaizen/retriever/feedback_system.py
 delete mode 100644 kaizen/retriever/llama_index_retriever.py
 delete mode 100644 kaizen/retriever/qdrant_vector_store.py
 delete mode 100644 kaizen/retriever/tree_sitter_utils.py
 delete mode 100644 kaizen/tests/retriever/test_chunker.py

diff --git a/examples/ragify_codebase/main.py b/examples/ragify_codebase/main.py
deleted file mode 100644
index dcff7207..00000000
--- a/examples/ragify_codebase/main.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from kaizen.retriever.llama_index_retriever import RepositoryAnalyzer
-
-# Initialize the analyzer
-analyzer = RepositoryAnalyzer()
-
-# Set up the repository (do this when you first analyze a repo or when you want to update it)
-analyzer.setup_repository("./github_app/")
-
-# Perform queries (you can do this as many times as you want without calling setup_repository again)
-results = analyzer.query("jwt token generation")
-for result in results:
-    print(f"File: {result['file_path']}")
-    # print(f"Abstraction: {result['abstraction']}")
-    # print(f"result:\n{result}")
-    print(f"Relevance Score: {result['relevance_score']}")
-    print("---")
-
-print(".......\n\n")
-
-results = analyzer.query("How do you filter the results?")
-for result in results:
-    print(f"File: {result['file_path']}")
-    # print(f"Abstraction: {result['abstraction']}")
-    # print(f"result:\n{result}")
-    print(f"Relevance Score: {result['relevance_score']}")
-    print("---")
-
-# # If you make changes to the repository and want to update the analysis:
-# analyzer.setup_repository("/path/to/your/repo")
-
-# Then you can query again with the updated data
-# results = analyzer.query("authentication")
diff --git a/kaizen/retriever/__init__.py b/kaizen/retriever/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/kaizen/retriever/code_chunker.py b/kaizen/retriever/code_chunker.py
deleted file mode 100644
index 49b0e94e..00000000
--- a/kaizen/retriever/code_chunker.py
+++ /dev/null
@@ -1,166 +0,0 @@
-from typing import Dict, Any
-from kaizen.retriever.tree_sitter_utils import parse_code, ParserFactory
-import os
-
-ParsedBody = Dict[str, Dict[str, Any]]
-
-
-def chunk_code(code: str, language: str) -> ParsedBody:
-    parser = ParserFactory.get_parser(language)
-    tree = parser.parse(code.encode("utf8"))
-    code_bytes = code.encode("utf8")
-    body: ParsedBody = {
-        "imports": [],
-        "global_variables": [],
-        "type_definitions": [],
-        "functions": {},
-        "async_functions": {},
-        "classes": {},
-        "hooks": {},
-        "components": {},
-        "jsx_elements": [],
-        "other_blocks": [],
-    }
-
-    def process_node(node):
-        result = parse_code(node, code_bytes)
-        if result:
-            start_line = result.get("start_line", 0)
-            end_line = result.get("end_line", 0)
-
-            if result["type"] == "import_statement":
-                body["imports"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif (
-                result["type"] == "variable_declaration"
-                and node.parent.type == "program"
-            ):
-                body["global_variables"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif result["type"] in ["type_alias", "interface_declaration"]:
-                body["type_definitions"].append(
-                    {
-                        "name": result["name"],
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif result["type"] == "function":
-                if is_react_hook(result["name"]):
-                    body["hooks"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                elif is_react_component(result["code"]):
-                    body["components"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                elif "async" in result["code"].split()[0]:
-                    body["async_functions"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                else:
-                    body["functions"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-            elif result["type"] == "class":
-                if is_react_component(result["code"]):
-                    body["components"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                else:
-                    body["classes"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-            elif result["type"] == "jsx_element":
-                body["jsx_elements"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-        else:
-            for child in node.children:
-                process_node(child)
-
-    process_node(tree.root_node)
-
-    # Collect remaining code as other_blocks
-    collected_ranges = []
-    for section in body.values():
-        if isinstance(section, dict):
-            for code_block in section.values():
-                collected_ranges.append(
-                    (code_block["start_line"], code_block["end_line"])
-                )
-        elif isinstance(section, list):
-            for code_block in section:
-                collected_ranges.append(
-                    (code_block["start_line"], code_block["end_line"])
-                )
-
-    collected_ranges.sort()
-    last_end = 0
-    for start, end in collected_ranges:
-        if start > last_end:
-            body["other_blocks"].append(code[last_end:start].strip())
-        last_end = end
-    if last_end < len(code):
-        body["other_blocks"].append(code[last_end:].strip())
-
-    return body
-
-
-def is_react_hook(name: str) -> bool:
-    return name.startswith("use") and len(name) > 3 and name[3].isupper()
-
-
-def is_react_component(code: str) -> bool:
-    return (
-        "React" in code
-        or "jsx" in code.lower()
-        or "tsx" in code.lower()
-        or "<" in code
-        or "props" in code
-        or "render" in code
-    )
-
-
-def clean_filename(filepath):
-    # Split the path into components
-    path_components = filepath.split(os.sep)
-
-    # Find the index of 'tmp' in the path
-    try:
-        tmp_index = path_components.index("tmp")
-    except ValueError:
-        # If 'tmp' is not found, return the original filepath
-        return filepath
-
-    # Join the components after 'tmp' to create the cleaned filename
-    cleaned_filename = os.sep.join(path_components[tmp_index + 2 :])
-
-    return cleaned_filename
diff --git a/kaizen/retriever/feedback_system.py b/kaizen/retriever/feedback_system.py
deleted file mode 100644
index 8c47a1ec..00000000
--- a/kaizen/retriever/feedback_system.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from typing import Dict, Any
-
-
-class AbstractionFeedback:
-    def __init__(self):
-        self.feedback_store: Dict[str, Dict[str, Any]] = {}
-
-    def add_feedback(
-        self, code_id: str, abstraction: str, rating: int, correction: str = None
-    ) -> None:
-        self.feedback_store[code_id] = {
-            "abstraction": abstraction,
-            "rating": rating,
-            "correction": correction,
-        }
-
-    def get_feedback(self, code_id: str) -> Dict[str, Any]:
-        return self.feedback_store.get(code_id, None)
diff --git a/kaizen/retriever/llama_index_retriever.py b/kaizen/retriever/llama_index_retriever.py
deleted file mode 100644
index 29b814b5..00000000
--- a/kaizen/retriever/llama_index_retriever.py
+++ /dev/null
@@ -1,407 +0,0 @@
-import os
-import logging
-from uuid import uuid4
-from llama_index.llms.litellm import LiteLLM
-import networkx as nx
-from typing import List, Dict, Any
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import tiktoken
-from kaizen.llms.provider import LLMProvider
-from kaizen.retriever.code_chunker import chunk_code, clean_filename
-import traceback
-from llama_index.embeddings.litellm import LiteLLMEmbedding
-from sqlalchemy import create_engine, text
-from kaizen.retriever.qdrant_vector_store import QdrantVectorStore
-import json
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-# Initialize tokenizer
-tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
-
-
-class RepositoryAnalyzer:
-    def __init__(self, repo_id=1):
-        logger.info("Initializing RepositoryAnalyzer")
-        self.engine = create_engine(
-            f"postgresql://{os.environ['POSTGRES_USER']}:{os.environ['POSTGRES_PASSWORD']}@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}/{os.environ['POSTGRES_DB']}",
-            pool_size=10,
-            max_overflow=20,
-        )
-        self.repo_id = repo_id
-        self.graph = nx.DiGraph()
-        self.vector_store = QdrantVectorStore("embeddings", vector_size=1536)
-        self.llm_provider = LLMProvider()
-        self.llm = LiteLLM(model_name="small", router=self.llm_provider.provider)
-        # embed_llm = LiteLLM(model_name="embedding", router=self.llm_provider.provider)
-        self.embed_model = LiteLLMEmbedding(
-            model_name="azure/text-embedding-3-small", router=self.llm_provider.provider
-        )
-        logger.info("RepositoryAnalyzer initialized successfully")
-
-    def setup_repository(
-        self,
-        repo_path: str,
-        node_query: str = None,
-        file_query: str = None,
-        function_query: str = None,
-    ):
-        self.total_usage = self.llm_provider.DEFAULT_USAGE
-        self.total_files_processed = 0
-        self.node_query = node_query
-        self.file_query = file_query
-        self.function_query = function_query
-        self.embedding_usage = {"prompt_tokens": 0, "total_tokens": 0}
-        logger.info(f"Starting repository setup for: {repo_path}")
-        self.parse_repository(repo_path)
-        self.store_function_relationships()
-        logger.info("Repository setup completed successfully")
-        return self.total_files_processed, self.total_usage, self.embedding_usage
-
-    def parse_repository(self, repo_path: str):
-        logger.info(f"Parsing repository: {repo_path}")
-        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
-            futures = []
-            for root, _, files in os.walk(repo_path):
-                for file in files:
-                    self.total_files_processed += 1
-                    if file.endswith(
-                        (".py", ".js", ".ts", ".rs")
-                    ):  # Add more extensions as needed
-                        file_path = os.path.join(root, file)
-                        futures.append(executor.submit(self.parse_file, file_path))
-
-            for future in as_completed(futures):
-                try:
-                    future.result()
-                except Exception as e:
-                    logger.error(f"Error in parsing file: {str(e)}")
-                    logger.error(traceback.format_exc())
-        logger.info("Repository parsing completed")
-
-    def parse_file(self, file_path: str):
-        logger.debug(f"Parsing file: {file_path}")
-        try:
-            with open(file_path, "r", encoding="utf-8") as file:
-                content = file.read()
-
-            language = self.get_language_from_extension(file_path)
-            chunked_code = chunk_code(content, language)
-
-            for section, items in chunked_code.items():
-                if isinstance(items, dict):
-                    for name, code_info in items.items():
-                        self.process_code_block(code_info, file_path, section, name)
-                elif isinstance(items, list):
-                    for i, code_info in enumerate(items):
-                        self.process_code_block(
-                            code_info, file_path, section, f"{section}_{i}"
-                        )
-            logger.debug(f"Successfully parsed file: {file_path}")
-        except Exception as e:
-            logger.error(f"Error processing file {file_path}: {str(e)}")
-            logger.error(traceback.format_exc())
-
-    @staticmethod
-    def get_language_from_extension(file_path: str) -> str:
-        ext = os.path.splitext(file_path)[1].lower()
-        return {
-            ".py": "python",
-            ".js": "javascript",
-            ".jsx": "javascript",
-            ".ts": "typescript",
-            ".tsx": "typescript",
-            ".rs": "rust",
-        }.get(ext, "unknown")
-
-    def process_code_block(
-        self, code_info: Dict[str, Any], file_path: str, section: str, name: str
-    ):
-        logger.debug(f"Processing code block: {section} - {name}")
-
-        if isinstance(code_info, str):
-            code = code_info
-            start_line = 1  # Default to 1 if no position information is available
-        elif isinstance(code_info, dict) and "code" in code_info:
-            code = code_info["code"]
-            start_line = code_info.get(
-                "start_line", 1
-            )  # Get start_line if available, default to 1
-        else:
-            logger.error(
-                f"Unexpected code_info format for {section} - {name}: {type(code_info)}"
-            )
-            return  # Skip this code block
-
-        language = self.get_language_from_extension(file_path)
-        abstraction, usage = self.generate_abstraction(code, language, section)
-        self.total_usage = self.llm_provider.update_usage(
-            total_usage=self.total_usage, current_usage=usage
-        )
-        function_id = self.store_code_in_db(
-            code, abstraction, file_path, section, name, start_line
-        )
-        self.store_abstraction_and_embedding(function_id, abstraction)
-
-        logger.debug(f"Finished processing code block: {section} - {name}")
-
-    def store_abstraction_and_embedding(self, function_id: int, abstraction: str):
-        logger.debug(
-            f"Storing abstraction and embedding for function_id: {function_id}"
-        )
-
-        embedding, emb_usage = self.llm_provider.get_text_embedding(abstraction)
-        self.embedding_usage = self.llm_provider.update_usage(
-            total_usage=self.embedding_usage, current_usage=emb_usage
-        )
-        embedding = embedding[0]["embedding"]
-        # Store the embedding in the database
-        # TODO: DONT PUSH DUPLICATE
-        with self.engine.begin() as connection:
-            embedding_query = text(
-                """
-                INSERT INTO function_embeddings (function_id, vector)
-                VALUES (:function_id, :vector)
-                ON CONFLICT (function_id) DO UPDATE SET vector = EXCLUDED.vector
-                """
-            )
-            connection.execute(
-                embedding_query,
-                {
-                    "function_id": function_id,
-                    "vector": embedding,
-                },
-            )
-
-        # Create a dictionary instead of TextNode
-        node = {
-            "id": str(uuid4()),
-            "text": abstraction,
-            "embedding": embedding,
-            "metadata": {"repo_id": self.repo_id, "function_id": function_id},
-        }
-
-        # Add the node to the vector store directly
-        self.vector_store.add(nodes=[node])
-
-        logger.debug(f"Abstraction and embedding stored for function_id: {function_id}")
-
-    def generate_abstraction(
-        self, code_block: str, language: str, section: str, max_tokens: int = 300
-    ) -> str:
-        prompt = f"""Analyze the following {language} code block and generate a structured abstraction.
-Your response should be in JSON format and include the following sections:
-
-{{
-    "summary": "A concise one-sentence summary of the function's primary purpose.",
-
-    "functionality": "A detailed explanation of what the function does, including its main steps and logic. Use multiple lines if needed for clarity.",
-
-    "inputs": [
-        {{
-            "name": "The parameter name",
-            "type": "The parameter type",
-            "description": "A brief description of the parameter's purpose",
-            "default_value": "The default value, if any (or null if not applicable)"
-        }}
-    ],
-
-    "output": {{
-        "type": "The return type of the function",
-        "description": "A description of what is returned and under what conditions. Use multiple lines if needed."
-    }},
-
-    "dependencies": [
-        {{
-            "name": "Name of the external library or module",
-            "purpose": "Brief explanation of its use in this function"
-        }}
-    ],
-
-    "algorithms": [
-        {{
-            "name": "Name of the algorithm or data structure",
-            "description": "Brief explanation of its use and importance"
-        }}
-    ],
-
-    "edge_cases": [
-        "A list of potential edge cases or special conditions the function handles or should handle"
-    ],
-
-    "error_handling": "A description of how errors are handled or propagated. Include specific error types if applicable.",
-
-    "usage_context": "A brief explanation of how this function might be used by parent functions or in a larger system. Include typical scenarios and any important considerations for its use.",
-
-    "complexity": {{
-        "time": "Estimated time complexity (e.g., O(n))",
-        "space": "Estimated space complexity (e.g., O(1))",
-        "explanation": "Brief explanation of the complexity analysis"
-    }},
-
-    "tags": ["List", "of", "relevant", "tags"],
-
-    "testing_considerations": "Suggestions for unit tests or test cases to cover key functionality and edge cases",
-
-    "version_compatibility": "Information about language versions or dependency versions this code is compatible with",
-
-    "performance_considerations": "Any notes on performance optimizations or potential bottlenecks",
-
-    "security_considerations": "Any security-related notes or best practices relevant to this code",
-
-    "maintainability_score": "A subjective score from 1-10 on how easy the code is to maintain, with a brief explanation"
-}}
-
-Provide your analysis in this clear, structured JSON format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly formatted as strings.
-
-Code to analyze:
-Language: {language}
-Block Type: {section}
-Code Block:
-```{code_block}```
-        """
-
-        estimated_prompt_tokens = len(tokenizer.encode(prompt))
-        adjusted_max_tokens = min(max(150, estimated_prompt_tokens), 1000)
-
-        try:
-            abstraction, usage = self.llm_provider.chat_completion_with_json(
-                prompt="",
-                messages=[
-                    {
-                        "role": "system",
-                        "content": "You are an expert programmer tasked with generating comprehensive and accurate abstractions of code snippets.",
-                    },
-                    {"role": "user", "content": prompt},
-                ],
-                custom_model={"max_tokens": adjusted_max_tokens, "model": "small"},
-            )
-            return json.dumps(abstraction), usage
-
-        except Exception as e:
-            raise e
-
-    def store_code_in_db(
-        self,
-        code: str,
-        abstraction: str,
-        file_path: str,
-        section: str,
-        name: str,
-        start_line: int,
-    ) -> int:
-        logger.debug(f"Storing code in DB: {file_path} - {section} - {name}")
-        clean_file_path = clean_filename(file_path)
-        with self.engine.begin() as connection:
-            # Insert into files table (assuming this part is already correct)
-            if not self.file_query:
-                self.file_query = """
-                INSERT INTO files (repo_id, file_path, file_name, file_ext, programming_language)
-                VALUES (:repo_id, :file_path, :file_name, :file_ext, :programming_language)
-                ON CONFLICT (repo_id, file_path) DO UPDATE SET file_path = EXCLUDED.file_path
-                RETURNING file_id
-                """
-            file_id = connection.execute(
-                text(self.file_query),
-                {
-                    "repo_id": self.repo_id,
-                    "file_path": clean_file_path,
-                    "file_name": os.path.basename(clean_file_path),
-                    "file_ext": os.path.splitext(clean_file_path)[1],
-                    "programming_language": self.get_language_from_extension(file_path),
-                },
-            ).scalar_one()
-
-            # Insert into function_abstractions table
-            if not self.function_query:
-                self.function_query = """
-                INSERT INTO function_abstractions
-                (file_id, function_name, function_signature, abstract_functionality, start_line, end_line)
-                VALUES (:file_id, :function_name, :function_signature, :abstract_functionality, :start_line, :end_line)
-                RETURNING function_id
-                """
-            function_id = connection.execute(
-                text(self.function_query),
-                {
-                    "file_id": file_id,
-                    "function_name": name,
-                    "function_signature": "",  # You might want to extract this from the code
-                    "abstract_functionality": abstraction,
-                    "start_line": start_line,
-                    "end_line": start_line + len(code.splitlines()) - 1,
-                },
-            ).scalar_one()
-
-        logger.debug(f"Code stored in DB with function_id: {function_id}")
-        return function_id
-
-    def store_function_relationships(self):
-        logger.info("Storing function relationships")
-        with self.engine.begin() as connection:
-            for caller, callee in self.graph.edges():
-                if not self.node_query:
-                    self.node_query = """
-                    INSERT INTO node_relationships (parent_node_id, child_node_id, relationship_type)
-                    VALUES (
-                        (SELECT node_id FROM syntax_nodes WHERE node_content LIKE :caller),
-                        (SELECT node_id FROM syntax_nodes WHERE node_content LIKE :callee),
-                        'calls'
-                    )
-                    ON CONFLICT DO NOTHING
-                    """
-
-                connection.execute(
-                    text(self.node_query),
-                    {"caller": f"%{caller}%", "callee": f"%{callee}%"},
-                )
-        logger.info("Function relationships stored successfully")
-
-    def query(
-        self, query_text: str, num_results: int = 5, repo_id=None
-    ) -> List[Dict[str, Any]]:
-        embedding, emb_usage = self.llm_provider.get_text_embedding(query_text)
-        embedding = embedding[0]["embedding"]
-
-        results = self.vector_store.search(embedding, limit=num_results)
-
-        processed_results = []
-        for result in results:
-            processed_results.append(
-                {
-                    "function_id": result.payload["function_id"],
-                    "relevance_score": result.score,
-                }
-            )
-
-        # Fetch additional data from the database
-        with self.engine.connect() as connection:
-            for result in processed_results:
-                query = text(
-                    """
-                    SELECT fa.function_name, fa.abstract_functionality, f.file_path, fa.function_signature
-                    FROM function_abstractions fa
-                    JOIN files f ON fa.file_id = f.file_id
-                    WHERE fa.function_id = :function_id
-                    """
-                )
-                db_result = connection.execute(
-                    query, {"function_id": result["function_id"]}
-                ).fetchone()
-                if db_result:
-                    result.update(
-                        {
-                            "function_name": db_result[0],
-                            "abstraction": db_result[1],
-                            "file_path": db_result[2],
-                            "function_signature": db_result[3],
-                        }
-                    )
-
-        return (
-            sorted(processed_results, key=lambda x: x["relevance_score"], reverse=True),
-            emb_usage,
-        )
diff --git a/kaizen/retriever/qdrant_vector_store.py b/kaizen/retriever/qdrant_vector_store.py
deleted file mode 100644
index b655abf1..00000000
--- a/kaizen/retriever/qdrant_vector_store.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams
-from qdrant_client.http.models import PointStruct
-from qdrant_client.http.exceptions import ResponseHandlingException
-import os
-import time
-import logging
-
-
-class QdrantVectorStore:
-    def __init__(self, collection_name, vector_size, max_retries=3, retry_delay=2):
-        self.HOST = os.getenv("QDRANT_HOST", "localhost")
-        self.PORT = os.getenv("QDRANT_PORT", "6333")
-        self.QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
-        self.collection_name = collection_name
-        self.max_retries = max_retries
-        self.retry_delay = retry_delay
-
-        self.client = self._connect_with_retry()
-        self._create_collection(vector_size)
-
-    def _connect_with_retry(self):
-        for attempt in range(self.max_retries):
-            try:
-                client = QdrantClient(
-                    self.HOST, port=self.PORT, api_key=self.QDRANT_API_KEY
-                )
-                # Test the connection
-                client.get_collections()
-                return client
-            except ResponseHandlingException as e:
-                if attempt < self.max_retries - 1:
-                    logging.warning(
-                        f"Connection attempt {attempt + 1} failed. Retrying in {self.retry_delay} seconds..."
-                    )
-                    time.sleep(self.retry_delay)
-                else:
-                    raise ConnectionError(
-                        f"Failed to connect to Qdrant server at {self.HOST}:{self.PORT} after {self.max_retries} attempts"
-                    ) from e
-
-    def _create_collection(self, vector_size):
-        try:
-            self.client.recreate_collection(
-                collection_name=self.collection_name,
-                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
-            )
-        except Exception as e:
-            raise RuntimeError(f"Failed to create or recreate collection: {str(e)}")
-
-    def add(self, nodes):
-        points = [
-            PointStruct(
-                id=node["id"], vector=node["embedding"], payload=node["metadata"]
-            )
-            for node in nodes
-        ]
-        self.client.upsert(collection_name=self.collection_name, points=points)
-
-    def search(self, query_vector, limit=10):
-        results = self.client.search(
-            collection_name=self.collection_name, query_vector=query_vector, limit=limit
-        )
-        return results
diff --git a/kaizen/retriever/tree_sitter_utils.py b/kaizen/retriever/tree_sitter_utils.py
deleted file mode 100644
index 0f1f63ab..00000000
--- a/kaizen/retriever/tree_sitter_utils.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import tree_sitter_python
-import tree_sitter_javascript
-import tree_sitter_typescript
-import tree_sitter_rust
-from tree_sitter import Language, Parser
-from typing import Dict, Any
-import logging
-from functools import lru_cache
-
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-PY_LANGUAGE = Language(tree_sitter_python.language())
-JS_LANGUAGE = Language(tree_sitter_javascript.language())
-TS_LANGUAGE = Language(tree_sitter_typescript.language_typescript())
-TSX_LANGUAGE = Language(tree_sitter_typescript.language_tsx())
-RUST_LANGUAGE = Language(tree_sitter_rust.language())
-
-
-class LanguageLoader:
-    @staticmethod
-    @lru_cache(maxsize=None)
-    def load_language(language: str) -> Language:
-        language_map = {
-            "python": PY_LANGUAGE,
-            "javascript": JS_LANGUAGE,
-            "typescript": TS_LANGUAGE,
-            "rust": RUST_LANGUAGE,
-        }
-        lang = language.replace("tree-sitter-", "")
-        if lang not in language_map:
-            raise ValueError(f"Unsupported language: {language}")
-        return language_map[lang]
-
-
-class ParserFactory:
-    @staticmethod
-    @lru_cache(maxsize=None)
-    def get_parser(language: str) -> Parser:
-        try:
-            parser = Parser()
-            lang = LanguageLoader.load_language(language)
-            parser.language = lang
-            return parser
-        except Exception as e:
-            logger.error(f"Failed to create parser for {language}: {str(e)}")
-            raise
-
-
-def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]:
-    if node.type in [
-        "function_definition",
-        "function_declaration",
-        "arrow_function",
-        "method_definition",
-    ]:
-        return {
-            "type": "function",
-            "name": (
-                node.child_by_field_name("name").text.decode("utf8")
-                if node.child_by_field_name("name")
-                else "anonymous"
-            ),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type in ["class_definition", "class_declaration"]:
-        return {
-            "type": "class",
-            "name": node.child_by_field_name("name").text.decode("utf8"),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type in ["jsx_element", "jsx_self_closing_element"]:
-        return {
-            "type": "component",
-            "name": (
-                node.child_by_field_name("opening_element")
-                .child_by_field_name("name")
-                .text.decode("utf8")
-                if node.type == "jsx_element"
-                else node.child_by_field_name("name").text.decode("utf8")
-            ),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type == "impl_item":
-        return {
-            "type": "impl",
-            "name": node.child_by_field_name("type").text.decode("utf8"),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    else:
-        return None
-
-
-def parse_code(node: Any, code_bytes: bytes) -> Dict[str, Any]:
-    try:
-        return traverse_tree(node, code_bytes)
-    except Exception as e:
-        logger.error(f"Failed to parse code: {str(e)}")
-        raise
-
-
-def check_language_files():
-    required_languages = ["python", "javascript", "typescript", "rust"]
-    missing_languages = []
-    for lang in required_languages:
-        try:
-            LanguageLoader.load_language(lang)
-        except Exception as e:
-            logger.warning(f"Failed to load language {lang}: {str(e)}")
-            missing_languages.append(lang)
-
-    if missing_languages:
-        logger.warning(
-            f"Missing or failed to load language files for: {', '.join(missing_languages)}"
-        )
-    else:
-        logger.info("All required language files are present and loaded successfully.")
-
-
-# Call this function at the start of your application
-check_language_files()
diff --git a/kaizen/tests/retriever/test_chunker.py b/kaizen/tests/retriever/test_chunker.py
deleted file mode 100644
index 54e6405d..00000000
--- a/kaizen/tests/retriever/test_chunker.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from kaizen.retriever.code_chunker import chunk_code
-import json
-
-
-# Example usage
-python_code = """
-import math
-
-def square(x):
-    return x * x
-
-class Circle:
-    def __init__(self, radius):
-        self.radius = radius
-
-    def area(self):
-        return math.pi * square(self.radius)
-
-if __name__ == "__main__":
-    c = Circle(5)
-    print(f"Area: {c.area()}")
-"""
-
-javascript_code = """
-import Math from 'math';
-
-function square(x) {
-    return x * x;
-}
-
-class Circle {
-    constructor(radius) {
-        this.radius = radius;
-    }
-
-    area() {
-        return Math.PI * square(this.radius);
-    }
-}
-
-const c = new Circle(5);
-console.log(`Area: ${c.area()}`);
-"""
-
-# Example usage
-react_nextjs_code = """
-import React, { useState, useEffect } from 'react';
-import Head from 'next/head';
-
-function useCustomHook() {
-    const [value, setValue] = useState(0);
-    return [value, setValue];
-}
-
-function HomePage() {
-    const [count, setCount] = useCustomHook();
-
-    useEffect(() => {
-        document.title = `Count: ${count}`;
-    }, [count]);
-
-    return (
-        <div>
-            <Head>
-                <title>Home Page</title>
-            </Head>
-            <h1>Welcome to Next.js!</h1>
-            <main>
-                <p>Count: {count}</p>
-            </main>
-        </div>
-    );
-}
-
-export default HomePage;
-"""
-
-
-def print_chunks(language, chunks):
-    print(f"\n{language.capitalize()} Chunks:")
-    print(json.dumps(chunks, indent=2))
-    # print("\nFunctions:")
-    # for name, func in chunks["functions"].items():
-    #     print(f"\n{name}:\n{func}")
-
-    # print("\nClasses:")
-    # for name, class_info in chunks["classes"].items():
-    #     print(f"\n{name}:")
-    #     print(f"Definition:\n{class_info['definition']}")
-    #     print("Methods:")
-    #     for method_name, method in class_info["methods"].items():
-    #         print(f"\n    {method_name}:\n{method}")
-
-    # print("\nOther Blocks:")
-    # for i, block in enumerate(chunks["other_blocks"], 1):
-    #     print(f"\nBlock {i}:\n{block}")
-
-
-print_chunks("Python", chunk_code(python_code, "python"))
-print_chunks("JavaScript", chunk_code(javascript_code, "javascript"))
-print_chunks("JavaScript", chunk_code(javascript_code, "javascript"))
-print_chunks("React", chunk_code(react_nextjs_code, "javascript"))
diff --git a/pyproject.toml b/pyproject.toml
index df59aa8b..7f6c5cc7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kaizen-cloudcode"
-version = "0.4.20"
+version = "0.4.21"
 description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly."
 authors = ["Saurav Panda "]
 license = "Apache2.0"
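
Note for downstream users: this patch removes the kaizen.retriever RAG stack
(code_chunker, tree_sitter_utils, feedback_system, the Qdrant vector store,
and the RepositoryAnalyzer example) in 0.4.21. A minimal sketch for consumers
who still need those modules, assuming a Poetry-managed project (the stanza
below is illustrative and not part of this patch): pin the last release that
still ships the retriever.

    [tool.poetry.dependencies]
    # Assumption: 0.4.20 is the final release that includes kaizen.retriever;
    # adapt the pin to your own dependency manager.
    kaizen-cloudcode = "0.4.20"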