-
Notifications
You must be signed in to change notification settings - Fork 817
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
potter-potter
committed
Nov 4, 2023
1 parent
67d45af
commit ed8a929
Showing
1 changed file
with
71 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,79 @@ | ||
import json | ||
import chromadb | ||
chroma_client = chromadb.Client() | ||
|
||
collection = chroma_client.create_collection(name="my_collection") | ||
from chromadb.errors import InvalidDimensionException | ||
|
||
import flatdict | ||
|
||
from unstructured.staging.base import flatten_dict | ||
|
||
|
||
|
||
# try: | ||
# docsearch = chromadb.from_documents(documents=..., embedding=...) | ||
# except InvalidDimensionException: | ||
# chromadb().delete_collection() | ||
# docsearch = chromadb.from_documents(documents=..., embedding=...) | ||
|
||
def flatten_values(value, seperator="\n", no_value_str=""): | ||
"""Flattens list or dict objects. Joins each value or item with | ||
the seperator character. Keys are not included in the joined string. | ||
When a dict value or a list item is None, no_value_str is used to | ||
represent that value / item.""" | ||
if value is None: | ||
return no_value_str | ||
|
||
if isinstance(value, list): | ||
flattened_values = [flatten_values(item, seperator) for item in value] | ||
return seperator.join(flattened_values) | ||
|
||
elif isinstance(value, dict): | ||
flattened_values = [flatten_values(item, seperator) for item in value.values()] | ||
return seperator.join(flattened_values) | ||
|
||
else: | ||
return str(value) | ||
|
||
|
||
e="/Users/davidpotter/Documents/Unstructured/sessions/unstructured/test_unstructured_ingest/workdir/s3-pinecone-dest/embedded/42d06000044204b602333f8d3a0f592d.json" | ||
with open(e, "r") as read_content: | ||
ed=(json.load(read_content)) | ||
|
||
|
||
breakpoint() | ||
chroma_client = chromadb.PersistentClient(path="/Users/davidpotter/Documents/Unstructured/sessions/unstructured/test_unstructured_ingest/chromadb/") | ||
|
||
collection = chroma_client.get_or_create_collection(name="my_collection") | ||
|
||
# collection.add( | ||
# documents=["This is a document", "This is another document"], | ||
# metadatas=[{"source": "my_source"}, {"source": "my_source"}], | ||
# ids=["id1", "id2"] | ||
# ) | ||
# print([x.get("embeddings") for x in ed]) | ||
# This worked?? | ||
md={"data_source":{"url":"example-docs/book-war-and-peace-1p.txt","date_created":"2023-10-2510:05:44.916316","date_modified":"2023-10-2510:05:44.916316"}} | ||
print(flatdict.FlatterDict(md,delimiter=".")) | ||
breakpoint() | ||
metadatas=[md for x in ed], | ||
print(metadatas) | ||
collection.add( | ||
documents=["This is a document", "This is another document"], | ||
metadatas=[{"source": "my_source"}, {"source": "my_source"}], | ||
ids=["id1", "id2"] | ||
documents=[x.get("text") for x in ed], | ||
embeddings=[x.get("embeddings") for x in ed], | ||
# metadatas=[x.get("metadata") for x in ed], | ||
metadatas=[dict(flatdict.FlatterDict(x.get("metadata"),delimiter=".")) for x in ed], | ||
ids=[x.get("element_id") for x in ed] | ||
) | ||
# breakpoint() | ||
|
||
# print(collection.get(include=["embeddings", "documents", "metadatas"])) | ||
|
||
results = collection.query( | ||
query_texts=["This is a query document"], | ||
n_results=2 | ||
) | ||
# results = collection.query( | ||
# query_texts=["who knows everything?"], | ||
# n_results=1 | ||
# ) | ||
|
||
# print(results) | ||
print(collection.count()) | ||
|
||
print(results) | ||
chroma_client.delete_collection(name="my_collection") |