Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add cleaner module #4

Merged
merged 1 commit into from
Nov 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions 01_intro.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,26 @@
"docs[0:2]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"str"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
13 changes: 11 additions & 2 deletions src/app/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import streamlit as st
from src.loaders.app import loader_use_case
from src.loaders.adapters import create_file_adapter
from src.loaders.adapters import create_file_adapter,create_document_adapter
from src.cleaner.app import cleaner_use_case
def header():
st.title("Actividad 01")
st.subheader("Carga de Documentos")
Expand All @@ -17,5 +18,13 @@ def launch_app():
st.write("Tamaño del archivo: ", uploaded_file.size)
st.write("Guardando archivo en un Bucket...")
file = create_file_adapter(uploaded_file)
loader_use_case.upload_to_bucket(file)
docs=loader_use_case.upload_to_bucket(file)
docs = create_document_adapter(docs)
text_example = 'Hi, this is an example for apply a cleaning process to a text.'
st.write("Texto de ejemplo:")
st.write(text_example)
text_cleaned = cleaner_use_case.clean_text(text_example)
st.write("Texto limpio:")
st.write(text_cleaned)

st.write("Archivo guardado en la carpeta /bucket.")
10 changes: 10 additions & 0 deletions src/cleaner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from .domain import CleanerRepository
from .infrastructure import SpacyRepository
from .use_case import CleanerUseCase

__all__ = [
"CleanerRepository",
"SpacyRepository",
"CleanerUseCase"
]

5 changes: 5 additions & 0 deletions src/cleaner/app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .spacy_instance import cleaner_use_case

__all__ = [
"cleaner_use_case"
]
5 changes: 5 additions & 0 deletions src/cleaner/app/spacy_instance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from ..infrastructure import SpacyRepository
from ..use_case import CleanerUseCase

# Module-level singletons: the spaCy-backed repository is built once when this
# module is imported, then injected into the use case that callers import.
spacy_repository = SpacyRepository()
cleaner_use_case = CleanerUseCase(spacy_repository)
5 changes: 5 additions & 0 deletions src/cleaner/domain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .repository import CleanerRepository

__all__ = [
"CleanerRepository"
]
13 changes: 13 additions & 0 deletions src/cleaner/domain/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from abc import ABC, abstractmethod
class CleanerRepository(ABC):
    """Abstract contract for text-cleaning backends.

    Every operation takes a string and returns a string. Concrete
    repositories must implement all three before they can be instantiated.
    """

    @abstractmethod
    def remove_stopwords(self, text: str) -> str:
        """Return ``text`` with its stop words removed."""

    @abstractmethod
    def to_lower(self, text: str) -> str:
        """Return ``text`` converted to lower case."""

    @abstractmethod
    def lemmatization(self, text: str) -> str:
        """Return ``text`` with its tokens replaced by their lemmas."""
5 changes: 5 additions & 0 deletions src/cleaner/infrastructure/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .spacy_repository import SpacyRepository

__all__ = [
"SpacyRepository"
]
22 changes: 22 additions & 0 deletions src/cleaner/infrastructure/spacy_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from ..domain import CleanerRepository
import spacy
# Before running, download the model: python -m spacy download en_core_web_sm
class SpacyRepository(CleanerRepository):
    """Text cleaner backed by the spaCy ``en_core_web_sm`` pipeline."""

    def __init__(self):
        # Load the pipeline once; every method on this instance reuses it.
        self.nlp = spacy.load("en_core_web_sm")

    def to_lower(self, text: str) -> str:
        """Return ``text`` converted to lower case."""
        lowered = text.lower()
        return lowered

    def remove_stopwords(self, text: str) -> str:
        """Return the tokens of ``text`` not flagged as stop words.

        The surviving token texts are joined with single spaces.
        """
        kept = (tok.text for tok in self.nlp(text) if not tok.is_stop)
        return " ".join(kept)

    def lemmatization(self, text: str) -> str:
        """Return the lemma of every token in ``text``, joined by single spaces."""
        lemmas = (tok.lemma_ for tok in self.nlp(text))
        return " ".join(lemmas)



5 changes: 5 additions & 0 deletions src/cleaner/use_case/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .cleaner_use_case import CleanerUseCase

__all__ = [
"CleanerUseCase"
]
11 changes: 11 additions & 0 deletions src/cleaner/use_case/cleaner_use_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from ..domain import CleanerRepository
class CleanerUseCase:
    """Run the text-cleaning pipeline on top of a ``CleanerRepository``."""

    def __init__(self, cleaner_repo: CleanerRepository):
        """Store the repository that performs the individual cleaning steps."""
        self.cleaner_repo = cleaner_repo

    def clean_text(self, text: str) -> str:
        """Return ``text`` after lower-casing, stop-word removal and lemmatization.

        The steps run in that order, each one receiving the previous
        step's output.
        """
        repo = self.cleaner_repo
        lowered = repo.to_lower(text)
        without_stopwords = repo.remove_stopwords(lowered)
        return repo.lemmatization(without_stopwords)

4 changes: 3 additions & 1 deletion src/loaders/adapters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from .file_adapter import create_file_adapter
from .document_adapter import create_document_adapter

__all__ = [
"create_file_adapter"
"create_file_adapter",
"create_document_adapter"
]
5 changes: 5 additions & 0 deletions src/loaders/adapters/document_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from langchain.schema import Document
from src.loaders.domain import DocumentValue

def create_document_adapter(docs: list[DocumentValue]) -> list[Document]:
    """Convert domain documents into LangChain ``Document`` objects.

    Each item's ``content`` becomes ``page_content`` and its ``metadata`` is
    passed through as-is; the output keeps the input order.
    """
    return [
        Document(page_content=item.content, metadata=item.metadata)
        for item in docs
    ]
5 changes: 3 additions & 2 deletions src/loaders/app/local_instance.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from src.loaders.infrastructure import LocalRepository
from src.loaders.infrastructure import LocalRepository,LangchainLoaderRepository
from src.loaders.use_cases import LoaderUseCase

local_repository = LocalRepository()
loader_use_case = LoaderUseCase(local_repository)
langchain_repository = LangchainLoaderRepository()
loader_use_case = LoaderUseCase(langchain_repository,local_repository)
6 changes: 4 additions & 2 deletions src/loaders/domain/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from .entity import FileEntity
from .repository import LoaderRepository
from .value import FileValue
from .repository import LoaderRepository,BucketRepository
from .value import FileValue,DocumentValue
__all__ = [
"FileEntity",
"LoaderRepository",
"FileValue",
"DocumentValue",
"BucketRepository"
]
7 changes: 6 additions & 1 deletion src/loaders/domain/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,9 @@ class FileEntity(ABC):
name: str
type: str
size: int
content: bytes
content: bytes

@dataclass
class DocumentEntity(ABC):
    """Base record describing a loaded document."""

    # Text content of the document.
    content: str
    # Metadata mapping associated with the document.
    metadata: dict
10 changes: 8 additions & 2 deletions src/loaders/domain/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from abc import ABC, abstractmethod
from .value import FileValue
from .value import FileValue,DocumentValue
class LoaderRepository(ABC):

@abstractmethod
def upload(self, file: FileValue) -> dict:
def load_file(self, path:str) -> list[DocumentValue]:
pass

class BucketRepository(ABC):
    """Abstract contract for a storage backend that accepts uploaded files."""

    @abstractmethod
    def upload(self, file: FileValue) -> str:
        """Store ``file`` and return a string locating the stored copy.

        NOTE(review): presumably the returned string is a path that a loader
        can open -- confirm against the concrete implementations.
        """
4 changes: 3 additions & 1 deletion src/loaders/domain/value.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .entity import FileEntity
from .entity import FileEntity,DocumentEntity
class FileValue(FileEntity):
    """Value object for a file; adds nothing to ``FileEntity``."""
class DocumentValue(DocumentEntity):
    """Value object for a document; adds nothing to ``DocumentEntity``."""
6 changes: 4 additions & 2 deletions src/loaders/infrastructure/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from .local_repository import LocalRepository
from .local_bucket_repository import LocalRepository
from .langchain_loader_repository import LangchainLoaderRepository

__all__ = [
"LocalRepository"
"LocalRepository",
"LangchainLoaderRepository",
]
12 changes: 12 additions & 0 deletions src/loaders/infrastructure/langchain_loader_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from src.loaders.domain import LoaderRepository,DocumentValue
from langchain_community.document_loaders import PyPDFLoader

class LangchainLoaderRepository(LoaderRepository):
    """Loader that reads PDF files through LangChain's ``PyPDFLoader``."""

    def load_file(self, path: str) -> list[DocumentValue]:
        """Load the PDF at ``path`` and return its documents as domain values.

        Each loaded document's ``page_content`` and ``metadata`` are copied
        into a ``DocumentValue``, preserving the loader's order.
        """
        loaded = PyPDFLoader(file_path=path).load()
        return [
            DocumentValue(content=item.page_content, metadata=item.metadata)
            for item in loaded
        ]
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import os
from src.loaders.domain import LoaderRepository
from src.loaders.domain import BucketRepository
from src.config import BUCKET_PATH

class LocalRepository(LoaderRepository):
def upload(self, file):
class LocalRepository(BucketRepository):
def upload(self, file)->str:
# Crear la carpeta /bucket si no existe
if not os.path.exists(BUCKET_PATH):
os.makedirs(BUCKET_PATH)

# Guardar el archivo en la carpeta /bucket
with open(os.path.join(BUCKET_PATH, file.name), "wb") as f:
file_path = os.path.join(BUCKET_PATH, file.name)
with open(file_path, "wb") as f:
f.write(file.content)


return file_path



11 changes: 7 additions & 4 deletions src/loaders/use_cases/loader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from src.loaders.domain import LoaderRepository,FileValue
from src.loaders.domain import LoaderRepository,FileValue,BucketRepository,DocumentValue
class LoaderUseCase:
def __init__(self, loader: LoaderRepository):
def __init__(self, loader: LoaderRepository,bucket:BucketRepository):
self.loader = loader
self.bucket = bucket

def upload_to_bucket(self, file:FileValue) -> dict:
return self.loader.upload(file)
def upload_to_bucket(self, file:FileValue) -> list[DocumentValue]:
pathname = self.bucket.upload(file)
docs = self.loader.load_file(pathname)
return docs