diff --git a/Package.resolved b/Package.resolved index f125137..2215a3a 100644 --- a/Package.resolved +++ b/Package.resolved @@ -9,6 +9,15 @@ "version" : "1.20.1" } }, + { + "identity" : "feedkit", + "kind" : "remoteSourceControl", + "location" : "https://github.com/nmdias/FeedKit", + "state" : { + "revision" : "68493a33d862c33c9a9f67ec729b3b7df1b20ade", + "version" : "9.1.2" + } + }, { "identity" : "functions-swift", "kind" : "remoteSourceControl", @@ -77,8 +86,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/buhe/openai-kit", "state" : { - "revision" : "ead156fa2e25e4e469cd837fda08836fb880a834", - "version" : "1.8.3" + "revision" : "f0c785d22dd14f71a42ad83cedbbddc02591aadb", + "version" : "1.8.5" } }, { @@ -102,10 +111,10 @@ { "identity" : "similarity-search-kit", "kind" : "remoteSourceControl", - "location" : "https://github.com/ZachNagengast/similarity-search-kit.git", + "location" : "https://github.com/buhe/similarity-search-kit", "state" : { - "revision" : "6d78d3051aa35f5701af34f9f49ab793a97c6b76", - "version" : "0.0.11" + "revision" : "39bbd0b5c2662728792ee9be132212e5fd4a49e2", + "version" : "0.0.16" } }, { @@ -149,8 +158,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-collections.git", "state" : { - "revision" : "d029d9d39c87bed85b1c50adee7c41795261a192", - "version" : "1.0.6" + "revision" : "94cf62b3ba8d4bed62680a282d4c25f9c63c2efb", + "version" : "1.1.0" } }, { @@ -266,8 +275,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/buhe/SwiftyNotion", "state" : { - "revision" : "8ff884579370c0ebbc81ae1d6b8e9b62076d25b7", - "version" : "0.1.2" + "revision" : "61af40905bfd5eb8cad454e02095dfbdea0498cb", + "version" : "0.1.3" } }, { diff --git a/Package.swift b/Package.swift index f77429d..26fa2e3 100644 --- a/Package.swift +++ b/Package.swift @@ -18,16 +18,17 @@ let package = Package( targets: ["LangChain"]), ], dependencies: [ - .package(url: "https://github.com/buhe/openai-kit", 
.upToNextMajor(from: "1.8.3")), + .package(url: "https://github.com/buhe/openai-kit", .upToNextMajor(from: "1.8.5")), .package(url: "https://github.com/supabase-community/supabase-swift", .upToNextMajor(from: "0.2.1")), .package(url: "https://github.com/SwiftyJSON/SwiftyJSON", .upToNextMajor(from: "5.0.1")), .package(url: "https://github.com/drmohundro/SWXMLHash", .upToNextMajor(from: "7.0.2")), .package(url: "https://github.com/scinfu/SwiftSoup", .upToNextMajor(from: "2.6.1")), .package(url: "https://github.com/juyan/swift-filestore", .upToNextMajor(from: "0.5.0")), - .package(url: "https://github.com/ZachNagengast/similarity-search-kit.git", from: "0.0.11"), + .package(url: "https://github.com/buhe/similarity-search-kit", from: "0.0.16"), .package(url: "https://github.com/google/generative-ai-swift", .upToNextMajor(from: "0.4.4")), - .package(url: "https://github.com/buhe/SwiftyNotion", .upToNextMajor(from: "0.1.2")), .package(url: "https://github.com/buhe/llmfarm_core.swift", .branch("langchain")), + .package(url: "https://github.com/buhe/SwiftyNotion", .upToNextMajor(from: "0.1.3")), + .package(url: "https://github.com/nmdias/FeedKit", .upToNextMajor(from: "9.1.2")), ], targets: [ // Targets are the basic building blocks of a package, defining a module or a test suite. 
@@ -42,9 +43,11 @@ let package = Package(
                 .product(name: "SwiftSoup", package: "SwiftSoup"),
                 .product(name: "SwiftFileStore", package: "swift-filestore"),
                 .product(name: "SimilaritySearchKit", package: "similarity-search-kit", condition: .when(platforms: [.macOS, .iOS, .visionOS])),
+//                .product(name: "SimilaritySearchKitDistilbert", package: "similarity-search-kit", condition: .when(platforms: [.macOS, .iOS, .visionOS])),
                 .product(name: "GoogleGenerativeAI", package: "generative-ai-swift"),
                 .product(name: "SwiftyNotion", package: "SwiftyNotion"),
                 .product(name: "llmfarm_core", package: "llmfarm_core.swift"),
+                .product(name: "FeedKit", package: "FeedKit"),
             ]
         ),
 
diff --git a/README.md b/README.md
index 35f6a40..bbe3fa1 100644
--- a/README.md
+++ b/README.md
@@ -402,6 +402,7 @@ Task(priority: .background) {
     - [x] FileStore
 - Embedding
     - [x] OpenAI
+    - [ ] Distilbert
 - Chain
     - [x] Base
     - [x] LLM
@@ -441,6 +442,7 @@ Task(priority: .background) {
     - [x] ImageOCRLoader
     - [x] AudioLoader
     - [x] NotionLoader
+    - [x] RSSLoader
 - OutputParser
     - [x] MRKLOutputParser
     - [x] ListOutputParser
diff --git a/Sources/LangChain/document_loaders/BaseLoader.swift b/Sources/LangChain/document_loaders/BaseLoader.swift
index d5e0d54..dd1b97b 100644
--- a/Sources/LangChain/document_loaders/BaseLoader.swift
+++ b/Sources/LangChain/document_loaders/BaseLoader.swift
@@ -6,9 +6,19 @@
 //
 
 import Foundation
-public struct Document {
+/// A piece of text together with its string metadata.
+///
+/// Two documents compare equal when their `page_content` matches; `metadata` is not compared.
+public struct Document: Equatable {
+    public init(page_content: String, metadata: [String : String]) {
+        self.page_content = page_content
+        self.metadata = metadata
+    }
     public let page_content: String
     public var metadata: [String: String]
+    public static func == (lhs: Document, rhs: Document) -> Bool {
+        return lhs.page_content == rhs.page_content
+    }
 }
 
 public class BaseLoader {
diff --git a/Sources/LangChain/document_loaders/RSSLoader.swift b/Sources/LangChain/document_loaders/RSSLoader.swift
new file mode 100644
index 0000000..619257e
--- /dev/null
+++ b/Sources/LangChain/document_loaders/RSSLoader.swift
@@ -0,0 +1,49 @@
+//
+//  RSSLoader.swift
+//
+//
+//  Created by 顾艳华 on 2/10/24.
+//
+
+import Foundation
+import FeedKit
+
+/// Loads an RSS, Atom or JSON feed and turns every entry into a `Document`.
+///
+/// Only the entry title is kept: it becomes `page_content` (an empty string when the
+/// entry has no title) and `metadata` is left empty.
+public class RSSLoader: BaseLoader {
+    let url: String
+
+    /// - Parameters:
+    ///   - url: Address of the feed, as a string.
+    ///   - callbacks: Handlers passed through to `BaseLoader`.
+    public init(url: String, callbacks: [BaseCallbackHandler] = []) {
+        self.url = url
+        super.init(callbacks: callbacks)
+    }
+
+    /// Parses the feed at `url` and returns one document per entry, in feed order.
+    /// - Throws: `URLError(.badURL)` when `url` is not a valid URL, or the FeedKit
+    ///   parser error when the feed cannot be parsed.
+    public override func _load() async throws -> [Document] {
+        // Report a malformed address as an error instead of trapping on a force-unwrap.
+        guard let feedURL = URL(string: url) else {
+            throw URLError(.badURL)
+        }
+        // NOTE(review): `parse()` looks synchronous; confirm it is acceptable to block here.
+        let titles: [String?]
+        switch FeedParser(URL: feedURL).parse() {
+        case .success(.atom(let feed)):
+            titles = (feed.entries ?? []).map(\.title)
+        case .success(.rss(let feed)):
+            titles = (feed.items ?? []).map(\.title)
+        case .success(.json(let feed)):
+            titles = (feed.items ?? []).map(\.title)
+        case .failure(let error):
+            // Propagate the failure: an empty result would be indistinguishable from an empty feed.
+            throw error
+        }
+        return titles.map { Document(page_content: $0 ?? "", metadata: [:]) }
+    }
+}
diff --git a/Sources/LangChain/embeddings/Distilbert.swift b/Sources/LangChain/embeddings/Distilbert.swift
new file mode 100644
index 0000000..0071898
--- /dev/null
+++ b/Sources/LangChain/embeddings/Distilbert.swift
@@ -0,0 +1,22 @@
+////
+////  File.swift
+////
+////
+////  Created by 顾艳华 on 2/11/24.
+////
+//
+//import Foundation
+//import SimilaritySearchKitDistilbert
+//
+//@available(macOS 13.0, *)
+//public struct Distilbert: Embeddings {
+//    let n = DistilbertEmbeddings()
+//    public init() {
+//
+//    }
+//
+//
+//    public func embedQuery(text: String) async -> [Float] {
+//        await n.encode(sentence: text)!
+// } +//} diff --git a/Sources/LangChain/embeddings/OpenAIEmbeddings.swift b/Sources/LangChain/embeddings/OpenAIEmbeddings.swift index fc89ba2..ad27c1b 100644 --- a/Sources/LangChain/embeddings/OpenAIEmbeddings.swift +++ b/Sources/LangChain/embeddings/OpenAIEmbeddings.swift @@ -11,8 +11,9 @@ import AsyncHTTPClient import OpenAIKit public struct OpenAIEmbeddings: Embeddings { - public init() { - + let session: URLSession + public init(session: URLSession = URLSession(configuration: .default)) { + self.session = session } // public func embedDocuments(texts: [String]) -> [[Float]] { @@ -20,21 +21,16 @@ public struct OpenAIEmbeddings: Embeddings { // } public func embedQuery(text: String) async -> [Float] { - let eventLoopGroup = ThreadManager.thread let env = Env.loadEnv() if let apiKey = env["OPENAI_API_KEY"] { let baseUrl = env["OPENAI_API_BASE"] ?? "api.openai.com" - let httpClient = HTTPClient(eventLoopGroupProvider: .shared(eventLoopGroup)) let configuration = Configuration(apiKey: apiKey, api: API(scheme: .https, host: baseUrl)) - let openAIClient = OpenAIKit.Client(httpClient: httpClient, configuration: configuration) - defer { - // it's important to shutdown the httpClient after all requests are done, even if one failed. See: https://github.com/swift-server/async-http-client - try? 
httpClient.syncShutdown() - } + let openAIClient = OpenAIKit.Client(session: session, configuration: configuration) + do { let embedding = try await openAIClient.embeddings.create(input: text) diff --git a/Sources/LangChain/retriever/MultiVectorRetriever.swift b/Sources/LangChain/retriever/MultiVectorRetriever.swift index e042bb3..2c2363c 100644 --- a/Sources/LangChain/retriever/MultiVectorRetriever.swift +++ b/Sources/LangChain/retriever/MultiVectorRetriever.swift @@ -17,7 +17,7 @@ public class MultiVectorRetriever: BaseRetriever { } public override func _get_relevant_documents(query: String) async throws -> [Document] { - let sub_docs = await self.vectorstore.similaritySearch(query: query, k: 1) + let sub_docs = await self.vectorstore.similaritySearch(query: query, k: 2) var ids: [String] = [] for d in sub_docs { ids.append(d.metadata[self.id_key]!) diff --git a/Sources/LangChain/retriever/ParentDocumentRetriever.swift b/Sources/LangChain/retriever/ParentDocumentRetriever.swift index 8434989..7d5ff8d 100644 --- a/Sources/LangChain/retriever/ParentDocumentRetriever.swift +++ b/Sources/LangChain/retriever/ParentDocumentRetriever.swift @@ -4,8 +4,9 @@ // // Created by 顾艳华 on 2023/11/17. // - +import CryptoKit import Foundation + public class ParentDocumentRetriever: MultiVectorRetriever { public init(child_splitter: TextSplitter, parent_splitter: TextSplitter? = nil, vectorstore: VectorStore, docstore: BaseStore) { self.child_splitter = child_splitter @@ -19,7 +20,10 @@ public class ParentDocumentRetriever: MultiVectorRetriever { let parent_splitter: TextSplitter? //The text splitter to use to create parent documents. //If none, then the parent documents will be the raw documents passed in. 
-    public func add_documents(documents: [Document]) async {
+    public func add_documents(documents: [Document]) async -> [String] {
+        if documents.isEmpty {
+            return []
+        }
         var parent_documents: [Document]
         if let p = self.parent_splitter {
             parent_documents = p.split_documents(documents: documents)
@@ -42,6 +46,32 @@ public class ParentDocumentRetriever: MultiVectorRetriever {
         await self.vectorstore.add_documents(documents: docs)
         await self.docstore.mset(kvpairs: full_docs)
         print("🚀 End add sub document \(docs.count), main document \(full_docs.count)")
+        return doc_ids
+    }
+
+    /// Removes parent documents from the docstore and their child chunks from the vector store.
+    ///
+    /// Child chunks are recomputed by running `child_splitter` over each document and are
+    /// addressed by the SHA-256 hex digest of their text.
+    /// - Parameter documents: Documents to remove; the docstore key is read from `metadata["id"]`.
+    ///   Documents without that entry are skipped for the docstore deletion.
+    public func remove_documents(documents: [Document]) async {
+        guard !documents.isEmpty else {
+            return
+        }
+        // compactMap instead of a force-unwrap: a document lacking "id" must not crash the removal.
+        let doc_ids = documents.compactMap { $0.metadata["id"] }
+        await self.docstore.mdelete(keys: doc_ids)
+        let all_sub_docs = documents.flatMap { self.child_splitter.split_documents(documents: [$0]) }
+        print("🚀 Begin remove sub document \(all_sub_docs.count), main document \(documents.count)")
+        await self.vectorstore.remove_documents(sha256s: all_sub_docs.map { sha256(str: $0.page_content) })
+        print("🚀 End remove sub document \(all_sub_docs.count), main document \(documents.count)")
+    }
+
+    /// Returns the lowercase hexadecimal SHA-256 digest of the UTF-8 bytes of `str`.
+    fileprivate func sha256(str: String) -> String {
+        let digest = SHA256.hash(data: Data(str.utf8))
+        return digest.map { String(format: "%02x", $0) }.joined()
     }
     // def add_documents(
     //     self,
diff --git a/Sources/LangChain/schema/LocalFileStore.swift b/Sources/LangChain/schema/LocalFileStore.swift
index bebbe22..e50f6c2 100644
--- a/Sources/LangChain/schema/LocalFileStore.swift
+++ b/Sources/LangChain/schema/LocalFileStore.swift
@@ -102,7 +102,7 @@ public class LocalFileStore: BaseStore {
         var allKeys: [String] = []
         let allSHA = try await objectStore!.readAllKeys(namespace: STORE_NS)
         for sha in allSHA {
-            print("sha: \(sha)")
+//            print("sha: \(sha)")
             if sha == ".DS_Store" {
                 continue
             }
diff --git 
a/Sources/LangChain/utilities/TextSplitter.swift b/Sources/LangChain/utilities/TextSplitter.swift
index 80be25a..97afa5d 100644
--- a/Sources/LangChain/utilities/TextSplitter.swift
+++ b/Sources/LangChain/utilities/TextSplitter.swift
@@ -92,7 +92,7 @@ public class TextSplitter {
     public func split_text(text: String) -> [String] {
         []
     }
-    func split_documents(documents: [Document]) -> [Document] {
+    public func split_documents(documents: [Document]) -> [Document] {
         var new_documents: [Document] = []
         for doc in documents {
             for chunk in self.split_text(text: doc.page_content){
diff --git a/Sources/LangChain/vectorstores/SimilaritySearchKit.swift b/Sources/LangChain/vectorstores/SimilaritySearchKit.swift
index f48e87b..3495d92 100644
--- a/Sources/LangChain/vectorstores/SimilaritySearchKit.swift
+++ b/Sources/LangChain/vectorstores/SimilaritySearchKit.swift
@@ -9,6 +9,7 @@ import Foundation
 
 #if os(macOS) || os(iOS) || os(visionOS)
 import SimilaritySearchKit
+import CryptoKit
 
 private struct LangChainEmbeddingBridge: EmbeddingsProtocol {
     
@@ -32,7 +33,12 @@ private struct LangChainEmbeddingBridge: EmbeddingsProtocol {
     }
     let embeddings: Embeddings
     func encode(sentence: String) async -> [Float]? {
-        await embeddings.embedQuery(text: sentence)
+        let e = await embeddings.embedQuery(text: sentence)
+        // Log the first 100 characters of any sentence whose embedding came back empty.
+        if e.isEmpty {
+            print("⚠️\(sentence.prefix(100))")
+        }
+        return e
     }
     
     
@@ -40,11 +46,20 @@ private struct LangChainEmbeddingBridge: EmbeddingsProtocol {
 
 public class SimilaritySearchKit: VectorStore {
     let vs: SimilarityIndex
-    public init(embeddings: Embeddings) async {
-        self.vs = await SimilarityIndex(
+    /// Creates a vector store backed by a `SimilarityIndex` that uses `embeddings` and the dot-product metric.
+    /// - Parameters:
+    ///   - embeddings: Embedding provider, wrapped so SimilaritySearchKit can call it.
+    ///   - autoLoad: When `true` and the OS supports it, tries to load a previously saved index.
+    public init(embeddings: Embeddings, autoLoad: Bool = false) {
+        self.vs = SimilarityIndex(
             model: LangChainEmbeddingBridge(embeddings: embeddings),
-            metric: CosineSimilarity()
+            metric: DotProduct()
         )
+        // Index loading is gated on macOS 13 / iOS 16 availability.
+        if autoLoad, #available(macOS 13.0, iOS 16.0, *) {
+            // Best effort: a load failure is ignored.
+            _ = try? vs.loadIndex()
+        }
     }
     
     override func similaritySearch(query: String, k: Int) async -> [MatchedModel] {
@@ -52,7 +67,30 @@ public class SimilaritySearchKit: VectorStore {
     }
     
     override func addText(text: String, metadata: [String: String]) async {
-        await vs.addItem(id: UUID().uuidString, text: text, metadata: metadata)
+        // Items are keyed by the SHA-256 of their text, the same key `removeText(sha256:)` takes.
+        await vs.addItem(id: sha256(str: text), text: text, metadata: metadata)
+    }
+    
+    /// Saves the index via `saveIndex()`, logging any failure instead of dropping it silently.
+    @available(iOS 16.0, *)
+    @available(macOS 13.0, *)
+    public func writeToFile() {
+        do {
+            _ = try vs.saveIndex()
+        } catch {
+            print("⚠️ Failed to save similarity index: \(error)")
+        }
+    }
+    
+    /// Removes the item whose id is `sha256`.
+    override func removeText(sha256: String) async {
+        vs.removeItem(id: sha256)
+    }
+    
+    /// Returns the lowercase hexadecimal SHA-256 digest of the UTF-8 bytes of `str`.
+    func sha256(str: String) -> String {
+        let digest = SHA256.hash(data: Data(str.utf8))
+        return digest.map { String(format: "%02x", $0) }.joined()
     }
 }
 #endif
diff --git a/Sources/LangChain/vectorstores/VectorStore.swift b/Sources/LangChain/vectorstores/VectorStore.swift
index ef10ce3..774bfae 100644
--- a/Sources/LangChain/vectorstores/VectorStore.swift
+++ b/Sources/LangChain/vectorstores/VectorStore.swift
@@ -16,7 +16,10 @@ public class VectorStore {
     func addText(text: String, metadata: [String: String]) async {
         
     }
-    
+    /// Removes the entry identified by `sha256`. The base implementation does nothing.
+    func removeText(sha256: String) async {
+        
+    }
     func similaritySearch(query: String, k: Int) async -> [MatchedModel] {
         []
     }
@@ -29,6 +32,15 @@ public class VectorStore {
         }
     }
     
+    /// Removes every entry in `sha256s` by calling `removeText(sha256:)` once per digest,
+    /// with the calls added as child tasks of a single task group.
+    func remove_documents(sha256s: [String]) async {
+        await withTaskGroup(of: Void.self) { [self] group in
+            for sha256 in sha256s {
+                group.addTask { await self.removeText(sha256: sha256)}
+            }
+        }
+    }
     // def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
     //     """Run more documents through the embeddings and add to the vectorstore.
     //
@@ -43,9 +55,3 @@ public class VectorStore {
     //     metadatas = [doc.metadata for doc in documents]
     //     return self.add_texts(texts, metadatas, **kwargs)
 }
-
-public protocol VectorStoreByUser {
-    func addText(text: String, user_id: String, metadata: [String: String]) async
-    
-    func similaritySearch(query: String, k: Int, user_id: String) async -> [MatchedModel]
-}
diff --git a/Tests/LangChainTests/langchain_swiftTests.swift b/Tests/LangChainTests/langchain_swiftTests.swift
index b2e8aa2..b6044b0 100644
--- a/Tests/LangChainTests/langchain_swiftTests.swift
+++ b/Tests/LangChainTests/langchain_swiftTests.swift
@@ -519,12 +519,13 @@ Action Input: the input to the action
         XCTAssertEqual(keys, ["2"])
     }
     
-    func testNotionLoader() async throws {
-        let l = NotionLoader()
-        let d = await l.load()
-        print("hello")
+    func testSimilaritySearchKitSHA256() throws {
+        let vs = SimilaritySearchKit(embeddings: OpenAIEmbeddings())
+        // Pin the published SHA-256 digest of "Hello, World!" so a wrong hash actually fails the test.
+        XCTAssertEqual(vs.sha256(str: "Hello, World!"), "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f")
     }
     
+
 //
 //    func testYoutubeHackClientList() async throws {
 //        let eventLoopGroup = ThreadManager.thread