Skip to content

Commit

Permalink
some rag refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
evilsocket committed Jun 26, 2024
1 parent ef3be3e commit ef6b94c
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 24 deletions.
4 changes: 2 additions & 2 deletions src/agent/namespaces/rag/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ impl Action for Search {
if !docs.is_empty() {
println!("\n {} results in {:?}", docs.len(), start.elapsed());
for (doc, score) in &docs {
println!(" * {} ({})", &doc.name, score);
println!(" * {} ({})", doc.get_path(), score);
}
println!();

Ok(Some(format!(
"Here is some supporting information:\n\n{}",
docs.iter()
.map(|(doc, _)| doc.data.clone())
.map(|(doc, _)| doc.get_data().to_string())
.collect::<Vec<String>>()
.join("\n")
)))
Expand Down
31 changes: 31 additions & 0 deletions src/agent/rag/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use std::path::PathBuf;

use anyhow::Result;

/// A text document loaded from disk, identified by its canonical path.
///
/// Fields are private; use [`Document::get_path`], [`Document::get_data`] and
/// [`Document::get_byte_size`] to read them.
#[derive(Clone, Debug)]
pub(crate) struct Document {
    /// Canonical form of the source path, rendered as a string.
    path: String,
    /// Full UTF-8 contents of the file.
    data: String,
}

impl Document {
    /// Loads the text file at `path` into a new [`Document`].
    ///
    /// The stored path is the canonicalized form of `path` (as produced by
    /// [`std::fs::canonicalize`]), so the same file reached through different
    /// relative paths yields the same document path.
    ///
    /// # Errors
    ///
    /// Returns an error if `path` cannot be canonicalized (for example, it
    /// does not exist) or if the file cannot be read as UTF-8 text.
    pub fn from_text_file(path: &PathBuf) -> Result<Self> {
        // Canonicalize the real path, not its `display()` rendering: `display()`
        // is lossy for non-UTF-8 paths and would make such files unresolvable.
        let canonical = std::fs::canonicalize(path)?;
        // Read through the canonical `PathBuf` for the same reason; the string
        // form is only produced for storage and reporting.
        let data = std::fs::read_to_string(&canonical)?;

        Ok(Self {
            path: canonical.display().to_string(),
            data,
        })
    }

    /// Returns the canonical path of the file this document was loaded from.
    pub fn get_path(&self) -> &str {
        &self.path
    }

    /// Returns the full text contents of the document.
    pub fn get_data(&self) -> &str {
        &self.data
    }

    /// Returns the size of the document contents in bytes (not characters).
    pub fn get_byte_size(&self) -> usize {
        // `str::len` is already the UTF-8 byte length.
        self.data.len()
    }
}
8 changes: 2 additions & 6 deletions src/agent/rag/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use naive::NaiveVectorStore;
use serde::{Deserialize, Serialize};

use super::generator::Client;
pub(crate) use document::Document;

pub(crate) mod document;
mod metrics;
pub(crate) mod naive;

Expand All @@ -15,12 +17,6 @@ pub struct Configuration {
pub path: String,
}

#[derive(Clone, Debug)]
pub struct Document {
pub name: String,
pub data: String,
}

#[async_trait]
pub trait VectorStore: Send {
#[allow(clippy::borrowed_box)]
Expand Down
27 changes: 11 additions & 16 deletions src/agent/rag/naive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,44 +43,39 @@ impl VectorStore for NaiveVectorStore {
let expr = format!("{}/**/*.txt", path);

for path in (glob(&expr)?).flatten() {
let doc_name = path.display();
let doc = Document {
name: doc_name.to_string(),
data: std::fs::read_to_string(&path)?,
};
let doc = Document::from_text_file(&path)?;
if let Err(err) = store.add(doc).await {
eprintln!("ERROR storing {}: {}", doc_name, err);
eprintln!("ERROR storing {}: {}", path.display(), err);
}
}

Ok(store)
}

async fn add(&mut self, document: Document) -> Result<()> {
if self.documents.contains_key(&document.name) {
let doc_path = document.get_path().to_string();

if self.documents.contains_key(&doc_path) {
return Err(anyhow!(
"document with name '{}' already indexed",
&document.name
&doc_path
));
}

// TODO: add chunking
let data_size = document.data.as_bytes().len();

print!(
"[{}] indexing document '{}' ({} bytes) ...",
"rag".bold(),
&document.name,
data_size
&doc_path,
document.get_byte_size()
);

let start = Instant::now();
let doc_name = document.name.to_string();
let embeddings = self.embedder.embeddings(&document.data).await?;
let embeddings: Vec<f64> = self.embedder.embeddings(document.get_data()).await?;
let size = embeddings.len();

self.documents.insert(doc_name.to_string(), document);
self.embeddings.insert(doc_name, embeddings);
self.documents.insert(doc_path.to_string(), document);
self.embeddings.insert(doc_path, embeddings);

println!(" time={:?} embedding_size={}", start.elapsed(), size);

Expand Down

0 comments on commit ef6b94c

Please sign in to comment.