Skip to content

Commit

Permalink
add colbert
Browse files Browse the repository at this point in the history
add colbert embedding
  • Loading branch information
Gautam-Rajeev committed Mar 19, 2024
1 parent 695419d commit 42c7ce1
Show file tree
Hide file tree
Showing 23 changed files with 184 additions and 227 deletions.
1 change: 0 additions & 1 deletion src/embeddings/bert/README.md

This file was deleted.

27 changes: 27 additions & 0 deletions src/embeddings/colbert/local/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install all system packages in a single layer so the apt cache is fetched
# once and cleaned in the same layer. git-lfs is included because Hugging Face
# repositories keep large binary files in Git LFS; without it a clone may
# contain only pointer files instead of the real index data.
RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    git-lfs \
    wget \
    && git lfs install \
    && rm -rf /var/lib/apt/lists/*

# Install requirements (copied on their own so this layer is cached until
# requirements.txt changes)
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Download the corpus CSV.
# model.py opens it by the absolute path /Testing1.csv, so it must be written
# to the filesystem root rather than into WORKDIR (/app).
RUN gdown "https://drive.google.com/uc?id=1VlLcGWmDKAoK3aUthVXOFxzOdgzf-SNo" -O /Testing1.csv

# Clone the prebuilt ColBERT index.
# model.py loads it from the absolute path /colbert_agri_embeddings/, so the
# clone target is given explicitly instead of defaulting to WORKDIR.
RUN git clone https://huggingface.co/GautamR/colbert_agri_embeddings /colbert_agri_embeddings

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
1 change: 1 addition & 0 deletions src/embeddings/colbert/local/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

2 changes: 2 additions & 0 deletions src/embeddings/colbert/local/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Public names of the package.
# ModelRequest is defined in request.py; Model is defined in model.py
# (request.py has no Model, so importing it from there raises ImportError).
from .request import ModelRequest
from .model import Model
26 changes: 26 additions & 0 deletions src/embeddings/colbert/local/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp
import pandas as pd
import gdown

app = Quart(__name__)

# Module-wide Model handle; stays None until startup() has run.
model = None


@app.before_serving
async def startup():
    """Create the shared HTTP client session and the Model before serving.

    The session is stored on ``app.client`` and the ``app`` object is passed
    to ``Model`` as its context.
    """
    global model
    app.client = aiohttp.ClientSession()
    model = Model(app)


@app.after_serving
async def shutdown():
    """Close the HTTP client session opened in ``startup``.

    Without this the session (and its connector) is left open when the server
    stops, which leaks connections and triggers aiohttp's
    "Unclosed client session" warning.
    """
    client = getattr(app, "client", None)
    if client is not None:
        await client.close()

@app.route('/', methods=['POST'])
async def embed():
    """Handle ``POST /``: build a ModelRequest from the JSON body and run it.

    The body must be a JSON object whose keys match the ``ModelRequest``
    constructor arguments (``text`` and ``k``).

    Returns:
        Whatever ``model.inference`` returns on success, or a JSON error
        payload with HTTP status 400 when the body is missing, is not a JSON
        object, or has missing/unexpected keys.
    """
    data = await request.get_json()
    # A missing or non-object body would make ``ModelRequest(**data)`` raise
    # and surface as a 500; report it as a client error instead.
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object with 'text' and 'k'."}, 400

    try:
        req = ModelRequest(**data)
    except TypeError:
        # Raised by the constructor for missing or unexpected keyword arguments.
        return {"error": "Request body must contain exactly the fields 'text' and 'k'."}, 400

    # ``model`` is only read here, so no ``global`` declaration is needed.
    return await model.inference(req)


if __name__ == "__main__":
    app.run()
31 changes: 31 additions & 0 deletions src/embeddings/colbert/local/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
from ragatouille import RAGPretrainedModel
from request import ModelRequest
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection



class Model():
    """Singleton holding the corpus DataFrame and the ColBERT searcher."""

    def __new__(cls, context):
        """Return the shared instance, creating and initialising it once.

        ``context`` is stored on the class on every call. On the first call
        the corpus CSV is loaded, a string ``PID`` column is derived from the
        row index, and a ``Searcher`` is built over the ``content`` column.
        """
        cls.context = context
        if hasattr(cls, 'instance'):
            return cls.instance

        cls.instance = super().__new__(cls)
        # Initialize Colbert
        cls.df = pd.read_csv('/Testing1.csv')
        cls.df['PID'] = cls.df.index.astype(str)
        run_config = RunConfig(experiment='notebook')
        with Run().context(run_config):
            cls.searcher = Searcher(
                index='/colbert_agri_embeddings/',
                collection=cls.df['content'].to_list(),
            )
        print(cls.df.columns)
        return cls.instance

    async def inference(self, request: ModelRequest):
        """Search the index for ``request.text`` and return the top ``request.k`` hits.

        Element 0 of the search result is used as row labels into the corpus
        DataFrame and element 2 is returned unchanged as the scores
        (presumably passage ids and relevance scores - confirm against the
        ColBERT ``Searcher.search`` contract).

        Returns:
            dict with keys ``ids``, ``content`` and ``scores``.
        """
        hits = self.searcher.search(request.text, request.k)
        row_labels = hits[0]
        # One label-based lookup per returned column, in the original order.
        searched_ids, searched_content = (
            self.df.loc[row_labels, column].to_list()
            for column in ('id', 'content')
        )
        return {
            "ids": searched_ids,
            "content": searched_content,
            "scores": hits[2],
        }
12 changes: 12 additions & 0 deletions src/embeddings/colbert/local/request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import requests
import json


class ModelRequest():
    """Search request: the query text and the number of results wanted."""

    def __init__(self, text, k ):
        # Query string to search for.
        self.text = text
        # Number of results requested.
        self.k = k

    def to_json(self):
        """Serialise this request to a JSON string (sorted keys, 4-space indent)."""

        def _attributes(obj):
            # Fallback for objects json cannot encode natively: use their
            # attribute dictionary.
            return obj.__dict__

        return json.dumps(
            self,
            default=_attributes,
            sort_keys=True,
            indent=4,
        )
12 changes: 12 additions & 0 deletions src/embeddings/colbert/local/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
torch
scikit-learn
quart
aiohttp
pandas
faiss-gpu
datasets
gdown
ragatouille
langchain-openai
colbert-ai
33 changes: 9 additions & 24 deletions src/embeddings/instructor/local/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import aiohttp
import pandas as pd
import io
from quart import jsonify

app = Quart(__name__)

Expand All @@ -16,37 +15,23 @@ async def startup():
global model
model = Model(app)


@app.route('/', methods=['POST'])
async def embed():
global model
data = await request.get_json()
files = await request.files
uploaded_file = files.get('file')
files = await request.files # await the coroutine
uploaded_file = files.get('file') # now you can use .get()

if uploaded_file:
df = pd.read_csv(uploaded_file.stream)
if df.empty or df['content'].isnull().any():
return jsonify({'error': 'There are nonzero null rows'}), 400 # Return a 400 Bad Request response with the error message

req = ModelRequest(df=df)
req = ModelRequest(df=df) # Pass the DataFrame to ModelRequest
response = await model.inference(req)

# If the response from the model is an error message, return it with a 400 status
if response == 'There are nonzero null rows':
return jsonify({'error': response}), 400

# Otherwise, assume response is a CSV string
df = pd.read_csv(io.StringIO(response))
df = pd.read_csv(io.StringIO(response)) # Convert the CSV string back to a DataFrame
# Save the DataFrame to a CSV file
df.to_csv('output.csv', index=False)

return await send_file('output.csv', mimetype='text/csv', as_attachment=True, attachment_filename='output.csv')
else:

else:
req = ModelRequest(**data)
response = await model.inference(req)

# Handle potential error from model inference in a similar way
if response == 'There are nonzero null rows':
return jsonify({'error': response}), 400

# Otherwise, send back the model's response
return response
return await model.inference(req)
49 changes: 13 additions & 36 deletions src/embeddings/instructor/local/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import wget
import pandas as pd
import os
from quart import jsonify # Import jsonify to send JSON responses


class Model():
def __new__(cls, context):
Expand All @@ -18,23 +16,13 @@ def __new__(cls, context):

async def inference(self, request: ModelRequest):
# Modify this function according to model requirements such that inputs and output remains the same
corpus_instruction = "Represent the document for retrieval:"
query_instruction = 'Represent the question for retrieving supporting documents: '
corpus_instruction = "Represent the Wikipedia document for retrieval:"
query_instruction = 'Represent the Wikipedia question for retrieving supporting documents: '
query = request.query
query_type = request.query_type

if(query != None):
# print('Query Encoding Process :-')
if query_type == 'retrieval':
query_embeddings = self.model.encode(
[[corpus_instruction, query]],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

else :
query_embeddings = self.model.encode(
query_embeddings = self.model.encode(
[[query_instruction, query]],
show_progress_bar=False,
batch_size=32,
Expand All @@ -45,26 +33,15 @@ async def inference(self, request: ModelRequest):
if not request.df.empty:
# print('Text corpus Encoding Process :-')
data = request.df
data = data.loc[~pd.isnull(data['content']),:]
data['content'] = data['content'].astype(str)

if data.empty or data['content'].isnull().any():
return 'There are nonzero null rows'

else :
text_corpus = data.loc[:,'content'].to_list()

if not text_corpus:
corpus_embeddings = self.model.encode(
[[corpus_instruction, text] for text in text_corpus],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
data['embeddings'] = corpus_embeddings.tolist()
csv_string = data.to_csv(index=False)
else:
return 'There are nonzero null rows'


text_corpus = data.loc[:,'content'].to_list()
corpus_embeddings = self.model.encode(
[[corpus_instruction, text] for text in text_corpus],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
data['embeddings'] = corpus_embeddings.tolist()
csv_string = data.to_csv(index=False)

return str(csv_string)
3 changes: 1 addition & 2 deletions src/embeddings/instructor/local/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@


class ModelRequest():
def __init__(self, query=None, df = pd.DataFrame(), query_type = None):
def __init__(self, query=None, df = pd.DataFrame()):
# Url to download csv file
self.query = query # String
self.query_type = query_type
self.df = df

def to_json(self):
Expand Down
4 changes: 2 additions & 2 deletions src/embeddings/instructor/local/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu
torch
quart
aiohttp
InstructorEmbedding
wget
pandas
tqdm
sentence_transformers
sentence-transformers==2.2.2
1 change: 0 additions & 1 deletion src/embeddings/instructor_gpu/README.md

This file was deleted.

15 changes: 0 additions & 15 deletions src/embeddings/instructor_gpu/local/Dockerfile

This file was deleted.

18 changes: 0 additions & 18 deletions src/embeddings/instructor_gpu/local/README.md

This file was deleted.

2 changes: 0 additions & 2 deletions src/embeddings/instructor_gpu/local/__init__.py

This file was deleted.

37 changes: 0 additions & 37 deletions src/embeddings/instructor_gpu/local/api.py

This file was deleted.

Loading

0 comments on commit 42c7ce1

Please sign in to comment.