diff --git a/README.md b/README.md index 262a74d..ef0a4f0 100644 --- a/README.md +++ b/README.md @@ -51,29 +51,18 @@ Refer to [input schema](.actor/input_schema.json) for details. - `datasetFields` - Array of datasetFields you want to save, e.g., `["url", "text", "metadata.title"]`. - `filePrefix` - Delete and create files using a filePrefix, streamlining vector store updates. - `fileIdsToDelete` - Delete specified file IDs from vector store as needed. -- `datasetId`: _[Debug]_ Dataset ID (when running Actor as standalone without integration). -- `keyValueStoreId`: _[Debug]_ Key Value Store ID (when running Actor as standalone without integration). +- `datasetId`: _[Debug]_ Apify's Dataset ID (when running Actor as standalone without integration). +- `keyValueStoreId`: _[Debug]_ Apify's Key Value Store ID (when running Actor as standalone without integration). - `saveInApifyKeyValueStore`: _[Debug]_ Save all created files in the Apify Key-Value Store to easily check and retrieve all files (this is typically used when debugging) ## Outputs This integration saves selected `datasetFields` from your Actor to the OpenAI Assistant and optionally to Actor Key Value Storage (useful for debugging). -## Want to talk to other devs or get help? - -Join our [developer community on Discord](https://discord.com/invite/jyEM2PRvMU) to connect with others and discuss this and other integrations. - -## Need data for your LLMs? - -Utilize the Apify platform to [gather data for your large language models](https://apify.com/data-for-generative-ai). -Our Actors can automatically ingest entire websites, such as customer documentation, knowledge bases, help centers, -forums, blog posts, and other information sources to train or prompt your LLMs. -Integrate Apify into your product and allow your customers to upload their content in minutes. - ## Save data from Website Content Crawler to OpenAI Vector Store To use this integration, you need an OpenAI account and an `OpenAI API KEY`. 
-Additionally, you need to create an OpenAI Vector Store (vectorStoreId). +Additionally, you need to create an OpenAI Vector Store (`vectorStoreId`). The Website Content Crawler can deeply crawl websites and save web page content to Apify's dataset. It also stores files such as PDFs, PPTXs, and DOCXs. @@ -121,3 +110,42 @@ The settings for the integration are as follows: "vectorStoreId": "YOUR-VECTOR-STORE-ID" } ``` + +### Save Amazon Products to OpenAI Vector Store + +You can also save Amazon products to the OpenAI Vector Store. +Again, you need to have an OpenAI account and an `OpenAI API KEY` with a created OpenAI Vector Store (`vectorStoreId`). + +To scrape Amazon products, you can use the [Amazon Product Scraper](https://apify.com/junglee/amazon-crawler) Actor. + +Let's say that you want to scrape "Apple Watch" and store all the scraped data in the OpenAI Assistant. +For the product URL `https://www.amazon.com/s?k=apple+watch`, the scraper can yield the following results (truncated for brevity): + +```json +[ + { + "title": "Apple Watch Ultra 2 [GPS + Cellular 49mm] Smartwatch with Rugged Titanium Case ....", + "asin": "B0CSVGK51Y", + "brand": "Apple", + "stars": 4.7, + "reviewsCount": 357, + "thumbnailImage": "https://m.media-amazon.com/images/I/81pjcQFaDJL.__AC_SY445_SX342_QL70_FMwebp_.jpg", + "price": { + "value": 794, + "currency": "$" + }, + "url": "https://www.amazon.com/dp/B0CSVGK51Y" + } +] +``` + +You can easily save the data to the OpenAI Vector Store by creating an integration (in the Amazon Product Scraper integration section) and specifying the fields you want to save: + +```json +{ + "assistantId": "YOUR-ASSISTANT-ID", + "datasetFields": ["title", "brand", "stars", "reviewsCount", "thumbnailImage", "price.value", "price.currency", "url"], + "openaiApiKey": "YOUR-OPENAI-API-KEY", + "vectorStoreId": "YOUR-VECTOR-STORE-ID" +} +``` diff --git a/src/main.py b/src/main.py index 14443f7..cbd35b3 100644 --- a/src/main.py +++ b/src/main.py @@ -4,6 +4,7 @@ 
from io import BytesIO from typing import TYPE_CHECKING +import openai import tiktoken from apify import Actor from apify_client import ApifyClientAsync @@ -58,13 +59,19 @@ async def main() -> None: async def check_inputs(client: AsyncOpenAI, actor_input: ActorInput, payload: dict) -> Assistant | None: """Check that provided input exists at OpenAI or at Apify.""" - if not (await client.beta.vector_stores.retrieve(actor_input.vectorStoreId)): + try: + await client.beta.vector_stores.retrieve(actor_input.vectorStoreId) + except openai.NotFoundError: msg = ( - f"Unable to find the Vector Store with the ID: {actor_input.vectorStoreId} on OpenAI. Please verify that the Vector Store has " - f"been correctly created and that the `vectorStoreId` provided is accurate." + f"Unable to find the OpenAI Vector Store with the ID: {actor_input.vectorStoreId}. Please verify that the Vector Store has " + "been correctly created and that the `vectorStoreId` provided is accurate." ) Actor.log.error(msg) await Actor.fail(status_message=msg) + except openai.AuthenticationError: + msg = "The OpenAI API Key provided is invalid. Please verify that the `OPENAI_API_KEY` is correctly set." + Actor.log.error(msg) + await Actor.fail(status_message=msg) assistant = None if actor_input.assistantId and not (assistant := await client.beta.assistants.retrieve(actor_input.assistantId)): @@ -79,7 +86,8 @@ async def check_inputs(client: AsyncOpenAI, actor_input: ActorInput, payload: di if not (dataset_id or key_value_store_id): msg = ( - "The `datasetId` or `keyValueStoreId` are not provided. There are two ways to specify the `datasetId` or `keyValueStoreId`:" + "Apify's `datasetId` or `keyValueStoreId` are not provided. " + "There are two ways to specify the `datasetId` or `keyValueStoreId`: " "1. Automatic Input: If this integration is used with other Actors, such as the Website Content Crawler, the variables should be " "automatically passed in the 'payload'. 
Please check the `Input` payload to ensure that they are included." "2. Manual Input: If you are running this Actor independently, you can to manually specify the 'datasetId' or `keyValueStoreId. " @@ -167,7 +175,7 @@ async def create_file(client: AsyncOpenAI, filename: str, data: bytes | BytesIO) await Actor.push_data({"filename": filename, "file_id": file.id, "status": "created"}) return file # noqa: TRY300 except Exception as e: - Actor.log.exception(e) + Actor.log.error("Failed to create OpenAI file: %s, error: %s", filename, e) return None