From 50c7ed57d5387d7725226a13dde2bd651d5e7987 Mon Sep 17 00:00:00 2001
From: Saurav Panda
Date: Sun, 27 Oct 2024 16:23:25 -0700
Subject: [PATCH] refactor: remove ragify to reduce package load size

---
 examples/ragify_codebase/main.py          |  32 --
 kaizen/retriever/__init__.py              |   0
 kaizen/retriever/code_chunker.py          | 166 ---------
 kaizen/retriever/feedback_system.py       |  18 -
 kaizen/retriever/llama_index_retriever.py | 407 ----------
 kaizen/retriever/qdrant_vector_store.py   |  64 ----
 kaizen/retriever/tree_sitter_utils.py     | 131 -------
 kaizen/tests/retriever/test_chunker.py    | 101 ------
 pyproject.toml                            |   2 +-
 9 files changed, 1 insertion(+), 920 deletions(-)
 delete mode 100644 examples/ragify_codebase/main.py
 delete mode 100644 kaizen/retriever/__init__.py
 delete mode 100644 kaizen/retriever/code_chunker.py
 delete mode 100644 kaizen/retriever/feedback_system.py
 delete mode 100644 kaizen/retriever/llama_index_retriever.py
 delete mode 100644 kaizen/retriever/qdrant_vector_store.py
 delete mode 100644 kaizen/retriever/tree_sitter_utils.py
 delete mode 100644 kaizen/tests/retriever/test_chunker.py

diff --git a/examples/ragify_codebase/main.py b/examples/ragify_codebase/main.py
deleted file mode 100644
index dcff7207..00000000
--- a/examples/ragify_codebase/main.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from kaizen.retriever.llama_index_retriever import RepositoryAnalyzer
-
-# Initialize the analyzer
-analyzer = RepositoryAnalyzer()
-
-# Set up the repository (do this when you first analyze a repo or when you want to update it)
-analyzer.setup_repository("./github_app/")
-
-# Perform queries (you can do this as many times as you want without calling setup_repository again)
-results = analyzer.query("jwt token generation")
-for result in results:
-    print(f"File: {result['file_path']}")
-    # print(f"Abstraction: {result['abstraction']}")
-    # print(f"result:\n{result}")
-    print(f"Relevance Score: {result['relevance_score']}")
-    print("---")
-
-print(".......\n\n")
-
-results = analyzer.query("How do you filter the results?")
-for result in results:
-    print(f"File: {result['file_path']}")
-    # print(f"Abstraction: {result['abstraction']}")
-    # print(f"result:\n{result}")
-    print(f"Relevance Score: {result['relevance_score']}")
-    print("---")
-
-# # If you make changes to the repository and want to update the analysis:
-# analyzer.setup_repository("/path/to/your/repo")
-
-# Then you can query again with the updated data
-# results = analyzer.query("authentication")
diff --git a/kaizen/retriever/__init__.py b/kaizen/retriever/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/kaizen/retriever/code_chunker.py b/kaizen/retriever/code_chunker.py
deleted file mode 100644
index 49b0e94e..00000000
--- a/kaizen/retriever/code_chunker.py
+++ /dev/null
@@ -1,166 +0,0 @@
-from typing import Dict, Any
-from kaizen.retriever.tree_sitter_utils import parse_code, ParserFactory
-import os
-
-ParsedBody = Dict[str, Dict[str, Any]]
-
-
-def chunk_code(code: str, language: str) -> ParsedBody:
-    parser = ParserFactory.get_parser(language)
-    tree = parser.parse(code.encode("utf8"))
-    code_bytes = code.encode("utf8")
-    body: ParsedBody = {
-        "imports": [],
-        "global_variables": [],
-        "type_definitions": [],
-        "functions": {},
-        "async_functions": {},
-        "classes": {},
-        "hooks": {},
-        "components": {},
-        "jsx_elements": [],
-        "other_blocks": [],
-    }
-
-    def process_node(node):
-        result = parse_code(node, code_bytes)
-        if result:
-            start_line = result.get("start_line", 0)
-            end_line = result.get("end_line", 0)
-
-            if result["type"] == "import_statement":
-                body["imports"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif (
-                result["type"] == "variable_declaration"
-                and node.parent.type == "program"
-            ):
-                body["global_variables"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif result["type"] in ["type_alias", "interface_declaration"]:
-                body["type_definitions"].append(
-                    {
-                        "name": result["name"],
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-            elif result["type"] == "function":
-                if is_react_hook(result["name"]):
-                    body["hooks"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                elif is_react_component(result["code"]):
-                    body["components"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                elif "async" in result["code"].split()[0]:
-                    body["async_functions"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                else:
-                    body["functions"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-            elif result["type"] == "class":
-                if is_react_component(result["code"]):
-                    body["components"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                else:
-                    body["classes"][result["name"]] = {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-            elif result["type"] == "jsx_element":
-                body["jsx_elements"].append(
-                    {
-                        "code": result["code"],
-                        "start_line": start_line,
-                        "end_line": end_line,
-                    }
-                )
-        else:
-            for child in node.children:
-                process_node(child)
-
-    process_node(tree.root_node)
-
-    # Collect remaining code as other_blocks
-    collected_ranges = []
-    for section in body.values():
-        if isinstance(section, dict):
-            for code_block in section.values():
-                collected_ranges.append(
-                    (code_block["start_line"], code_block["end_line"])
-                )
-        elif isinstance(section, list):
-            for code_block in section:
-                collected_ranges.append(
-                    (code_block["start_line"], code_block["end_line"])
-                )
-
-    collected_ranges.sort()
-    last_end = 0
-    for start, end in collected_ranges:
-        if start > last_end:
-            body["other_blocks"].append(code[last_end:start].strip())
-        last_end = end
-    if last_end < len(code):
-        body["other_blocks"].append(code[last_end:].strip())
-
-    return body
-
-
-def is_react_hook(name: str) -> bool:
-    return name.startswith("use") and len(name) > 3 and name[3].isupper()
-
-
-def is_react_component(code: str) -> bool:
-    return (
-        "React" in code
-        or "jsx" in code.lower()
-        or "tsx" in code.lower()
-        or "<" in code
-        or "props" in code
-        or "render" in code
-    )
-
-
-def clean_filename(filepath):
-    # Split the path into components
-    path_components = filepath.split(os.sep)
-
-    # Find the index of 'tmp' in the path
-    try:
-        tmp_index = path_components.index("tmp")
-    except ValueError:
-        # If 'tmp' is not found, return the original filepath
-        return filepath
-
-    # Join the components after 'tmp' to create the cleaned filename
-    cleaned_filename = os.sep.join(path_components[tmp_index + 2 :])
-
-    return cleaned_filename
diff --git a/kaizen/retriever/feedback_system.py b/kaizen/retriever/feedback_system.py
deleted file mode 100644
index 8c47a1ec..00000000
--- a/kaizen/retriever/feedback_system.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from typing import Dict, Any
-
-
-class AbstractionFeedback:
-    def __init__(self):
-        self.feedback_store: Dict[str, Dict[str, Any]] = {}
-
-    def add_feedback(
-        self, code_id: str, abstraction: str, rating: int, correction: str = None
-    ) -> None:
-        self.feedback_store[code_id] = {
-            "abstraction": abstraction,
-            "rating": rating,
-            "correction": correction,
-        }
-
-    def get_feedback(self, code_id: str) -> Dict[str, Any]:
-        return self.feedback_store.get(code_id, None)
diff --git a/kaizen/retriever/llama_index_retriever.py b/kaizen/retriever/llama_index_retriever.py
deleted file mode 100644
index 29b814b5..00000000
--- a/kaizen/retriever/llama_index_retriever.py
+++ /dev/null
@@ -1,407 +0,0 @@
-import os
-import logging
-from uuid import uuid4
-from llama_index.llms.litellm import LiteLLM
-import networkx as nx
-from typing import List, Dict, Any
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import tiktoken
-from kaizen.llms.provider import LLMProvider
-from kaizen.retriever.code_chunker import chunk_code, clean_filename
-import traceback
-from llama_index.embeddings.litellm import LiteLLMEmbedding
-from sqlalchemy import create_engine, text
-from kaizen.retriever.qdrant_vector_store import QdrantVectorStore
-import json
-
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-# Initialize tokenizer
-tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
-
-
-class RepositoryAnalyzer:
-    def __init__(self, repo_id=1):
-        logger.info("Initializing RepositoryAnalyzer")
-        self.engine = create_engine(
-            f"postgresql://{os.environ['POSTGRES_USER']}:{os.environ['POSTGRES_PASSWORD']}@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}/{os.environ['POSTGRES_DB']}",
-            pool_size=10,
-            max_overflow=20,
-        )
-        self.repo_id = repo_id
-        self.graph = nx.DiGraph()
-        self.vector_store = QdrantVectorStore("embeddings", vector_size=1536)
-        self.llm_provider = LLMProvider()
-        self.llm = LiteLLM(model_name="small", router=self.llm_provider.provider)
-        # embed_llm = LiteLLM(model_name="embedding", router=self.llm_provider.provider)
-        self.embed_model = LiteLLMEmbedding(
-            model_name="azure/text-embedding-3-small", router=self.llm_provider.provider
-        )
-        logger.info("RepositoryAnalyzer initialized successfully")
-
-    def setup_repository(
-        self,
-        repo_path: str,
-        node_query: str = None,
-        file_query: str = None,
-        function_query: str = None,
-    ):
-        self.total_usage = self.llm_provider.DEFAULT_USAGE
-        self.total_files_processed = 0
-        self.node_query = node_query
-        self.file_query = file_query
-        self.function_query = function_query
-        self.embedding_usage = {"prompt_tokens": 0, "total_tokens": 0}
-        logger.info(f"Starting repository setup for: {repo_path}")
-        self.parse_repository(repo_path)
-        self.store_function_relationships()
-        logger.info("Repository setup completed successfully")
-        return self.total_files_processed, self.total_usage, self.embedding_usage
-
-    def parse_repository(self, repo_path: str):
-        logger.info(f"Parsing repository: {repo_path}")
-        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
-            futures = []
-            for root, _, files in os.walk(repo_path):
-                for file in files:
-                    self.total_files_processed += 1
-                    if file.endswith(
-                        (".py", ".js", ".ts", ".rs")
-                    ):  # Add more extensions as needed
-                        file_path = os.path.join(root, file)
-                        futures.append(executor.submit(self.parse_file, file_path))
-
-            for future in as_completed(futures):
-                try:
-                    future.result()
-                except Exception as e:
-                    logger.error(f"Error in parsing file: {str(e)}")
-                    logger.error(traceback.format_exc())
-        logger.info("Repository parsing completed")
-
-    def parse_file(self, file_path: str):
-        logger.debug(f"Parsing file: {file_path}")
-        try:
-            with open(file_path, "r", encoding="utf-8") as file:
-                content = file.read()
-
-            language = self.get_language_from_extension(file_path)
-            chunked_code = chunk_code(content, language)
-
-            for section, items in chunked_code.items():
-                if isinstance(items, dict):
-                    for name, code_info in items.items():
-                        self.process_code_block(code_info, file_path, section, name)
-                elif isinstance(items, list):
-                    for i, code_info in enumerate(items):
-                        self.process_code_block(
-                            code_info, file_path, section, f"{section}_{i}"
-                        )
-            logger.debug(f"Successfully parsed file: {file_path}")
-        except Exception as e:
-            logger.error(f"Error processing file {file_path}: {str(e)}")
-            logger.error(traceback.format_exc())
-
-    @staticmethod
-    def get_language_from_extension(file_path: str) -> str:
-        ext = os.path.splitext(file_path)[1].lower()
-        return {
-            ".py": "python",
-            ".js": "javascript",
-            ".jsx": "javascript",
-            ".ts": "typescript",
-            ".tsx": "typescript",
-            ".rs": "rust",
-        }.get(ext, "unknown")
-
-    def process_code_block(
-        self, code_info: Dict[str, Any], file_path: str, section: str, name: str
-    ):
-        logger.debug(f"Processing code block: {section} - {name}")
-
-        if isinstance(code_info, str):
-            code = code_info
-            start_line = 1  # Default to 1 if no position information is available
-        elif isinstance(code_info, dict) and "code" in code_info:
-            code = code_info["code"]
-            start_line = code_info.get(
-                "start_line", 1
-            )  # Get start_line if available, default to 1
-        else:
-            logger.error(
-                f"Unexpected code_info format for {section} - {name}: {type(code_info)}"
-            )
-            return  # Skip this code block
-
-        language = self.get_language_from_extension(file_path)
-        abstraction, usage = self.generate_abstraction(code, language, section)
-        self.total_usage = self.llm_provider.update_usage(
-            total_usage=self.total_usage, current_usage=usage
-        )
-        function_id = self.store_code_in_db(
-            code, abstraction, file_path, section, name, start_line
-        )
-        self.store_abstraction_and_embedding(function_id, abstraction)
-
-        logger.debug(f"Finished processing code block: {section} - {name}")
-
-    def store_abstraction_and_embedding(self, function_id: int, abstraction: str):
-        logger.debug(
-            f"Storing abstraction and embedding for function_id: {function_id}"
-        )
-
-        embedding, emb_usage = self.llm_provider.get_text_embedding(abstraction)
-        self.embedding_usage = self.llm_provider.update_usage(
-            total_usage=self.embedding_usage, current_usage=emb_usage
-        )
-        embedding = embedding[0]["embedding"]
-        # Store the embedding in the database
-        # TODO: DONT PUSH DUPLICATE
-        with self.engine.begin() as connection:
-            embedding_query = text(
-                """
-                INSERT INTO function_embeddings (function_id, vector)
-                VALUES (:function_id, :vector)
-                ON CONFLICT (function_id) DO UPDATE SET vector = EXCLUDED.vector
-                """
-            )
-            connection.execute(
-                embedding_query,
-                {
-                    "function_id": function_id,
-                    "vector": embedding,
-                },
-            )
-
-        # Create a dictionary instead of TextNode
-        node = {
-            "id": str(uuid4()),
-            "text": abstraction,
-            "embedding": embedding,
-            "metadata": {"repo_id": self.repo_id, "function_id": function_id},
-        }
-
-        # Add the node to the vector store directly
-        self.vector_store.add(nodes=[node])
-
-        logger.debug(f"Abstraction and embedding stored for function_id: {function_id}")
-
-    def generate_abstraction(
-        self, code_block: str, language: str, section: str, max_tokens: int = 300
-    ) -> str:
-        prompt = f"""Analyze the following {language} code block and generate a structured abstraction.
-Your response should be in JSON format and include the following sections:
-
-{{
-    "summary": "A concise one-sentence summary of the function's primary purpose.",
-
-    "functionality": "A detailed explanation of what the function does, including its main steps and logic. Use multiple lines if needed for clarity.",
-
-    "inputs": [
-        {{
-            "name": "The parameter name",
-            "type": "The parameter type",
-            "description": "A brief description of the parameter's purpose",
-            "default_value": "The default value, if any (or null if not applicable)"
-        }}
-    ],
-
-    "output": {{
-        "type": "The return type of the function",
-        "description": "A description of what is returned and under what conditions. Use multiple lines if needed."
-    }},
-
-    "dependencies": [
-        {{
-            "name": "Name of the external library or module",
-            "purpose": "Brief explanation of its use in this function"
-        }}
-    ],
-
-    "algorithms": [
-        {{
-            "name": "Name of the algorithm or data structure",
-            "description": "Brief explanation of its use and importance"
-        }}
-    ],
-
-    "edge_cases": [
-        "A list of potential edge cases or special conditions the function handles or should handle"
-    ],
-
-    "error_handling": "A description of how errors are handled or propagated. Include specific error types if applicable.",
-
-    "usage_context": "A brief explanation of how this function might be used by parent functions or in a larger system. Include typical scenarios and any important considerations for its use.",
-
-    "complexity": {{
-        "time": "Estimated time complexity (e.g., O(n))",
-        "space": "Estimated space complexity (e.g., O(1))",
-        "explanation": "Brief explanation of the complexity analysis"
-    }},
-
-    "tags": ["List", "of", "relevant", "tags"],
-
-    "testing_considerations": "Suggestions for unit tests or test cases to cover key functionality and edge cases",
-
-    "version_compatibility": "Information about language versions or dependency versions this code is compatible with",
-
-    "performance_considerations": "Any notes on performance optimizations or potential bottlenecks",
-
-    "security_considerations": "Any security-related notes or best practices relevant to this code",
-
-    "maintainability_score": "A subjective score from 1-10 on how easy the code is to maintain, with a brief explanation"
-}}
-
-Provide your analysis in this clear, structured JSON format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly formatted as strings.
-
-Code to analyze:
-Language: {language}
-Block Type: {section}
-Code Block:
-```{code_block}```
-        """
-
-        estimated_prompt_tokens = len(tokenizer.encode(prompt))
-        adjusted_max_tokens = min(max(150, estimated_prompt_tokens), 1000)
-
-        try:
-            abstraction, usage = self.llm_provider.chat_completion_with_json(
-                prompt="",
-                messages=[
-                    {
-                        "role": "system",
-                        "content": "You are an expert programmer tasked with generating comprehensive and accurate abstractions of code snippets.",
-                    },
-                    {"role": "user", "content": prompt},
-                ],
-                custom_model={"max_tokens": adjusted_max_tokens, "model": "small"},
-            )
-            return json.dumps(abstraction), usage
-
-        except Exception as e:
-            raise e
-
-    def store_code_in_db(
-        self,
-        code: str,
-        abstraction: str,
-        file_path: str,
-        section: str,
-        name: str,
-        start_line: int,
-    ) -> int:
-        logger.debug(f"Storing code in DB: {file_path} - {section} - {name}")
-        clean_file_path = clean_filename(file_path)
-        with self.engine.begin() as connection:
-            # Insert into files table (assuming this part is already correct)
-            if not self.file_query:
-                self.file_query = """
-                INSERT INTO files (repo_id, file_path, file_name, file_ext, programming_language)
-                VALUES (:repo_id, :file_path, :file_name, :file_ext, :programming_language)
-                ON CONFLICT (repo_id, file_path) DO UPDATE SET file_path = EXCLUDED.file_path
-                RETURNING file_id
-                """
-            file_id = connection.execute(
-                text(self.file_query),
-                {
-                    "repo_id": self.repo_id,
-                    "file_path": clean_file_path,
-                    "file_name": os.path.basename(clean_file_path),
-                    "file_ext": os.path.splitext(clean_file_path)[1],
-                    "programming_language": self.get_language_from_extension(file_path),
-                },
-            ).scalar_one()
-
-            # Insert into function_abstractions table
-            if not self.function_query:
-                self.function_query = """
-                INSERT INTO function_abstractions
-                (file_id, function_name, function_signature, abstract_functionality, start_line, end_line)
-                VALUES (:file_id, :function_name, :function_signature, :abstract_functionality, :start_line, :end_line)
-                RETURNING function_id
-                """
-            function_id = connection.execute(
-                text(self.function_query),
-                {
-                    "file_id": file_id,
-                    "function_name": name,
-                    "function_signature": "",  # You might want to extract this from the code
-                    "abstract_functionality": abstraction,
-                    "start_line": start_line,
-                    "end_line": start_line + len(code.splitlines()) - 1,
-                },
-            ).scalar_one()
-
-        logger.debug(f"Code stored in DB with function_id: {function_id}")
-        return function_id
-
-    def store_function_relationships(self):
-        logger.info("Storing function relationships")
-        with self.engine.begin() as connection:
-            for caller, callee in self.graph.edges():
-                if not self.node_query:
-                    self.node_query = """
-                    INSERT INTO node_relationships (parent_node_id, child_node_id, relationship_type)
-                    VALUES (
-                        (SELECT node_id FROM syntax_nodes WHERE node_content LIKE :caller),
-                        (SELECT node_id FROM syntax_nodes WHERE node_content LIKE :callee),
-                        'calls'
-                    )
-                    ON CONFLICT DO NOTHING
-                    """
-
-                connection.execute(
-                    text(self.node_query),
-                    {"caller": f"%{caller}%", "callee": f"%{callee}%"},
-                )
-        logger.info("Function relationships stored successfully")
-
-    def query(
-        self, query_text: str, num_results: int = 5, repo_id=None
-    ) -> List[Dict[str, Any]]:
-        embedding, emb_usage = self.llm_provider.get_text_embedding(query_text)
-        embedding = embedding[0]["embedding"]
-
-        results = self.vector_store.search(embedding, limit=num_results)
-
-        processed_results = []
-        for result in results:
-            processed_results.append(
-                {
-                    "function_id": result.payload["function_id"],
-                    "relevance_score": result.score,
-                }
-            )
-
-        # Fetch additional data from the database
-        with self.engine.connect() as connection:
-            for result in processed_results:
-                query = text(
-                    """
-                    SELECT fa.function_name, fa.abstract_functionality, f.file_path, fa.function_signature
-                    FROM function_abstractions fa
-                    JOIN files f ON fa.file_id = f.file_id
-                    WHERE fa.function_id = :function_id
-                    """
-                )
-                db_result = connection.execute(
-                    query, {"function_id": result["function_id"]}
-                ).fetchone()
-                if db_result:
-                    result.update(
-                        {
-                            "function_name": db_result[0],
-                            "abstraction": db_result[1],
-                            "file_path": db_result[2],
-                            "function_signature": db_result[3],
-                        }
-                    )
-
-        return (
-            sorted(processed_results, key=lambda x: x["relevance_score"], reverse=True),
-            emb_usage,
-        )
diff --git a/kaizen/retriever/qdrant_vector_store.py b/kaizen/retriever/qdrant_vector_store.py
deleted file mode 100644
index b655abf1..00000000
--- a/kaizen/retriever/qdrant_vector_store.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams
-from qdrant_client.http.models import PointStruct
-from qdrant_client.http.exceptions import ResponseHandlingException
-import os
-import time
-import logging
-
-
-class QdrantVectorStore:
-    def __init__(self, collection_name, vector_size, max_retries=3, retry_delay=2):
-        self.HOST = os.getenv("QDRANT_HOST", "localhost")
-        self.PORT = os.getenv("QDRANT_PORT", "6333")
-        self.QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
-        self.collection_name = collection_name
-        self.max_retries = max_retries
-        self.retry_delay = retry_delay
-
-        self.client = self._connect_with_retry()
-        self._create_collection(vector_size)
-
-    def _connect_with_retry(self):
-        for attempt in range(self.max_retries):
-            try:
-                client = QdrantClient(
-                    self.HOST, port=self.PORT, api_key=self.QDRANT_API_KEY
-                )
-                # Test the connection
-                client.get_collections()
-                return client
-            except ResponseHandlingException as e:
-                if attempt < self.max_retries - 1:
-                    logging.warning(
-                        f"Connection attempt {attempt + 1} failed. Retrying in {self.retry_delay} seconds..."
-                    )
-                    time.sleep(self.retry_delay)
-                else:
-                    raise ConnectionError(
-                        f"Failed to connect to Qdrant server at {self.HOST}:{self.PORT} after {self.max_retries} attempts"
-                    ) from e
-
-    def _create_collection(self, vector_size):
-        try:
-            self.client.recreate_collection(
-                collection_name=self.collection_name,
-                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
-            )
-        except Exception as e:
-            raise RuntimeError(f"Failed to create or recreate collection: {str(e)}")
-
-    def add(self, nodes):
-        points = [
-            PointStruct(
-                id=node["id"], vector=node["embedding"], payload=node["metadata"]
-            )
-            for node in nodes
-        ]
-        self.client.upsert(collection_name=self.collection_name, points=points)
-
-    def search(self, query_vector, limit=10):
-        results = self.client.search(
-            collection_name=self.collection_name, query_vector=query_vector, limit=limit
-        )
-        return results
diff --git a/kaizen/retriever/tree_sitter_utils.py b/kaizen/retriever/tree_sitter_utils.py
deleted file mode 100644
index 0f1f63ab..00000000
--- a/kaizen/retriever/tree_sitter_utils.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import tree_sitter_python
-import tree_sitter_javascript
-import tree_sitter_typescript
-import tree_sitter_rust
-from tree_sitter import Language, Parser
-from typing import Dict, Any
-import logging
-from functools import lru_cache
-
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-PY_LANGUAGE = Language(tree_sitter_python.language())
-JS_LANGUAGE = Language(tree_sitter_javascript.language())
-TS_LANGUAGE = Language(tree_sitter_typescript.language_typescript())
-TSX_LANGUAGE = Language(tree_sitter_typescript.language_tsx())
-RUST_LANGUAGE = Language(tree_sitter_rust.language())
-
-
-class LanguageLoader:
-    @staticmethod
-    @lru_cache(maxsize=None)
-    def load_language(language: str) -> Language:
-        language_map = {
-            "python": PY_LANGUAGE,
-            "javascript": JS_LANGUAGE,
-            "typescript": TS_LANGUAGE,
-            "rust": RUST_LANGUAGE,
-        }
-        lang = language.replace("tree-sitter-", "")
-        if lang not in language_map:
-            raise ValueError(f"Unsupported language: {language}")
-        return language_map[lang]
-
-
-class ParserFactory:
-    @staticmethod
-    @lru_cache(maxsize=None)
-    def get_parser(language: str) -> Parser:
-        try:
-            parser = Parser()
-            lang = LanguageLoader.load_language(language)
-            parser.language = lang
-            return parser
-        except Exception as e:
-            logger.error(f"Failed to create parser for {language}: {str(e)}")
-            raise
-
-
-def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]:
-    if node.type in [
-        "function_definition",
-        "function_declaration",
-        "arrow_function",
-        "method_definition",
-    ]:
-        return {
-            "type": "function",
-            "name": (
-                node.child_by_field_name("name").text.decode("utf8")
-                if node.child_by_field_name("name")
-                else "anonymous"
-            ),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type in ["class_definition", "class_declaration"]:
-        return {
-            "type": "class",
-            "name": node.child_by_field_name("name").text.decode("utf8"),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type in ["jsx_element", "jsx_self_closing_element"]:
-        return {
-            "type": "component",
-            "name": (
-                node.child_by_field_name("opening_element")
-                .child_by_field_name("name")
-                .text.decode("utf8")
-                if node.type == "jsx_element"
-                else node.child_by_field_name("name").text.decode("utf8")
-            ),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    elif node.type == "impl_item":
-        return {
-            "type": "impl",
-            "name": node.child_by_field_name("type").text.decode("utf8"),
-            "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"),
-            "start_line": node.start_point[0],
-            "end_line": node.end_point[0],
-        }
-    else:
-        return None
-
-
-def parse_code(node: Any, code_bytes: bytes) -> Dict[str, Any]:
-    try:
-        return traverse_tree(node, code_bytes)
-    except Exception as e:
-        logger.error(f"Failed to parse code: {str(e)}")
-        raise
-
-
-def check_language_files():
-    required_languages = ["python", "javascript", "typescript", "rust"]
-    missing_languages = []
-    for lang in required_languages:
-        try:
-            LanguageLoader.load_language(lang)
-        except Exception as e:
-            logger.warning(f"Failed to load language {lang}: {str(e)}")
-            missing_languages.append(lang)
-
-    if missing_languages:
-        logger.warning(
-            f"Missing or failed to load language files for: {', '.join(missing_languages)}"
-        )
-    else:
-        logger.info("All required language files are present and loaded successfully.")
-
-
-# Call this function at the start of your application
-check_language_files()
diff --git a/kaizen/tests/retriever/test_chunker.py b/kaizen/tests/retriever/test_chunker.py
deleted file mode 100644
index 54e6405d..00000000
--- a/kaizen/tests/retriever/test_chunker.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from kaizen.retriever.code_chunker import chunk_code
-import json
-
-
-# Example usage
-python_code = """
-import math
-
-def square(x):
-    return x * x
-
-class Circle:
-    def __init__(self, radius):
-        self.radius = radius
-
-    def area(self):
-        return math.pi * square(self.radius)
-
-if __name__ == "__main__":
-    c = Circle(5)
-    print(f"Area: {c.area()}")
-"""
-
-javascript_code = """
-import Math from 'math';
-
-function square(x) {
-    return x * x;
-}
-
-class Circle {
-    constructor(radius) {
-        this.radius = radius;
-    }
-
-    area() {
-        return Math.PI * square(this.radius);
-    }
-}
-
-const c = new Circle(5);
-console.log(`Area: ${c.area()}`);
-"""
-
-# Example usage
-react_nextjs_code = """
-import React, { useState, useEffect } from 'react';
-import Head from 'next/head';
-
-function useCustomHook() {
-    const [value, setValue] = useState(0);
-    return [value, setValue];
-}
-
-function HomePage() {
-    const [count, setCount] = useCustomHook();
-
-    useEffect(() => {
-        document.title = `Count: ${count}`;
-    }, [count]);
-
-    return (
-        <div>
-            <Head>
-                <title>Home Page</title>
-            </Head>
-            <h1>Welcome to Next.js!</h1>
-            <main>
-                <p>Count: {count}</p>
-            </main>
-        </div>
-    );
-}
-
-export default HomePage;
-"""
-
-
-def print_chunks(language, chunks):
-    print(f"\n{language.capitalize()} Chunks:")
-    print(json.dumps(chunks, indent=2))
-    # print("\nFunctions:")
-    # for name, func in chunks["functions"].items():
-    #     print(f"\n{name}:\n{func}")
-
-    # print("\nClasses:")
-    # for name, class_info in chunks["classes"].items():
-    #     print(f"\n{name}:")
-    #     print(f"Definition:\n{class_info['definition']}")
-    #     print("Methods:")
-    #     for method_name, method in class_info["methods"].items():
-    #         print(f"\n    {method_name}:\n{method}")
-
-    # print("\nOther Blocks:")
-    # for i, block in enumerate(chunks["other_blocks"], 1):
-    #     print(f"\nBlock {i}:\n{block}")
-
-
-print_chunks("Python", chunk_code(python_code, "python"))
-print_chunks("JavaScript", chunk_code(javascript_code, "javascript"))
-print_chunks("JavaScript", chunk_code(javascript_code, "javascript"))
-print_chunks("React", chunk_code(react_nextjs_code, "javascript"))
diff --git a/pyproject.toml b/pyproject.toml
index df59aa8b..7f6c5cc7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kaizen-cloudcode"
-version = "0.4.20"
+version = "0.4.21"
 description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly."
 authors = ["Saurav Panda "]
 license = "Apache2.0"
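
Note for downstream users: this patch removes the kaizen.retriever RAG stack
(code_chunker, tree_sitter_utils, feedback_system, the Qdrant vector store,
and the RepositoryAnalyzer example) in 0.4.21. A minimal sketch for consumers
who still need those modules, assuming a Poetry-managed project (the stanza
below is illustrative and not part of this patch): pin the last release that
still ships the retriever.

    [tool.poetry.dependencies]
    # Assumption: 0.4.20 is the final release that includes kaizen.retriever;
    # adapt the pin to your own dependency manager.
    kaizen-cloudcode = "0.4.20"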