From 167796725a8bd4933eadda7b8bf84edac164bba7 Mon Sep 17 00:00:00 2001
From: Vignesh Skanda
Date: Fri, 4 Oct 2024 20:12:46 +0530
Subject: [PATCH 1/4] Update web_crawler.py

---
 crawl4ai/web_crawler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 7dea56c..26d2d1a 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -1,4 +1,5 @@
 import os, time
+import json
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from pathlib import Path
@@ -234,4 +235,4 @@ def process_html(
         extracted_content=extracted_content,
         success=True,
         error_message="",
-    )
\ No newline at end of file
+    )

From 7a4130810c9caba4242bdda7959d8db97fa87260 Mon Sep 17 00:00:00 2001
From: Vignesh Skanda
Date: Fri, 4 Oct 2024 20:20:50 +0530
Subject: [PATCH 2/4] Update utils.py

---
 crawl4ai/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 71a36ae..be65171 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string):
     return parsed_objects, unparsed_segments
 
 def sanitize_html(html):
-    # Replace all weird and special characters with an empty string
+    # Replace all unwanted and special characters with an empty string
     sanitized_html = html
     # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
@@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         if tag.name != 'img':
             tag.attrs = {}
 
-    # Extract all img tgas inti [{src: '', alt: ''}]
+    # Extract all img tags into [{src: '', alt: ''}]
     media = {
         'images': [],
         'videos': [],
@@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
                 img.decompose()
 
-    # Create a function that replace content of all"pre" tage with its inner text
+    # Create a function that replace content of all "pre" tag with its inner text
     def replace_pre_tags_with_text(node):
         for child in node.find_all('pre'):
             # set child inner html to its text
@@ -502,7 +502,7 @@ def find_closest_parent_with_useful_text(tag):
     current_tag = tag
     while current_tag:
         current_tag = current_tag.parent
-        # Get the text content of the parent tag
+        # Get the text content from the parent tag
         if current_tag:
             text_content = current_tag.get_text(separator=' ',strip=True)
             # Check if the text content has at least word_count_threshold

From 72e605ca9fb08d1469cbd3d7a9eceb1aac568c0f Mon Sep 17 00:00:00 2001
From: Vignesh Skanda
Date: Fri, 4 Oct 2024 20:24:47 +0530
Subject: [PATCH 3/4] Update train.py

Error: incorrect pipe name (textcat_multilabel). By default, spaCy's
TextCategorizer pipeline is called "textcat", not "textcat_multilabel".
You need to ensure that you have installed and are using a specific
multi-label classification pipeline, or that you are setting it up
correctly. If you are aiming for multi-label classification, consider
handling it via "textcat" and configuring multi-label categorization
in your training data, as in the sketch below.

Changes made to the code:
- textcat: replaced "textcat_multilabel" with "textcat".
- Fixed random seed: removed spacy.util.fix_random_seed and just used
  random.seed.
- Vocabulary filtering: added an isalpha() check so that only
  alphabetic words are saved to the vocabulary.

Ensure you have the spacy-textcat component installed for multi-label
text categorization.
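For reference, a minimal sketch of the non-exclusive (multi-label)
setup this patch assumes. It targets spaCy v2.x, where "textcat" is
the only built-in text categorizer and multi-label behaviour is
selected via its config (v2 also creates the component before adding
it, unlike the string form used in the diff); in spaCy v3 the same
case is covered by the separate built-in "textcat_multilabel"
component. The Reuters labels and the training sentence below are
illustrative only.

    import spacy

    nlp = spacy.blank("en")
    # In spaCy v2, multi-label classification is configured on the
    # "textcat" component by turning off mutually exclusive classes.
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": False, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(textcat, last=True)
    for label in ("earn", "acq", "grain"):
        textcat.add_label(label)

    # Multi-label training data: more than one category may be true
    # at once, each scored independently.
    train_data = [
        ("Wheat exports rose while the company reported higher earnings.",
         {"cats": {"earn": 1.0, "acq": 0.0, "grain": 1.0}}),
    ]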
---
 crawl4ai/train.py | 54 ++++++++++++-----------------------------------
 1 file changed, 13 insertions(+), 41 deletions(-)

diff --git a/crawl4ai/train.py b/crawl4ai/train.py
index f7e7c1a..f9b67e6 100644
--- a/crawl4ai/train.py
+++ b/crawl4ai/train.py
@@ -7,7 +7,7 @@
 def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     # Extract the TextCategorizer component
-    textcat = nlp.get_pipe("textcat_multilabel")
+    textcat = nlp.get_pipe("textcat")
 
     # Convert the weights to a PyTorch state dictionary
     state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
@@ -24,74 +24,56 @@
     print(f"Model weights and vocabulary saved to: {model_dir}")
 
 def extract_vocab(nlp):
-    # Extract vocabulary from the SpaCy model
-    vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
+    vocab = {word: i for i, word in enumerate(nlp.vocab.strings) if word.isalpha()}
     return vocab
 
-nlp = spacy.load("models/reuters")
-save_spacy_model_as_torch(nlp, model_dir="models")
-
 def train_and_save_reuters_model(model_dir="models/reuters"):
-    # Ensure the Reuters corpus is downloaded
     nltk.download('reuters')
     nltk.download('punkt')
 
     if not reuters.fileids():
         print("Reuters corpus not found.")
         return
 
-    # Load a blank English spaCy model
     nlp = spacy.blank("en")
+    textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False, "architecture": "simple_cnn"})
 
-    # Create a TextCategorizer with the ensemble model for multi-label classification
-    textcat = nlp.add_pipe("textcat_multilabel")
-
-    # Add labels to text classifier
     for label in reuters.categories():
         textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
         text = reuters.raw(fileid)
         cats = {label: label in categories for label in reuters.categories()}
-        # Prepare spacy Example objects
         doc = nlp.make_doc(text)
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the text categorizer with the example objects
     nlp.initialize(lambda: train_examples)
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
-    for i in range(5):  # Adjust iterations for better accuracy
+    for i in range(5):
         random.shuffle(train_examples)
         losses = {}
-        # Create batches of data
         batches = spacy.util.minibatch(train_examples, size=8)
         for batch in batches:
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
 def train_model(model_dir, additional_epochs=0):
-    # Load the model if it exists, otherwise start with a blank model
     try:
         nlp = spacy.load(model_dir)
         print("Model loaded from disk.")
     except IOError:
         print("No existing model found. Starting with a new model.")
         nlp = spacy.blank("en")
-        textcat = nlp.add_pipe("textcat_multilabel")
+        textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False})
 
     for label in reuters.categories():
         textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
@@ -101,15 +83,12 @@ def train_model(model_dir, additional_epochs=0):
         text = reuters.raw(fileid)
         cats = {label: label in categories for label in reuters.categories()}
         doc = nlp.make_doc(text)
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the model if it was newly created
-    if 'textcat_multilabel' not in nlp.pipe_names:
+    if 'textcat' not in nlp.pipe_names:
         nlp.initialize(lambda: train_examples)
     else:
         print("Continuing training with existing model.")
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
     num_epochs = 5 + additional_epochs
     for i in range(num_epochs):
         random.shuffle(train_examples)
@@ -119,28 +98,21 @@ def train_model(model_dir, additional_epochs=0):
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
-def load_model_and_predict(model_dir, text, tok_k = 3):
-    # Load the trained model from the specified directory
+def load_model_and_predict(model_dir, text, tok_k=3):
     nlp = spacy.load(model_dir)
-
-    # Process the text with the loaded model
     doc = nlp(text)
-
-    # gee top 3 categories
     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
     print(f"Top {tok_k} categories:")
-
-    return top_categories
+    for category, score in top_categories:
+        print(f"{category}: {score:.4f}")
+    return top_categories
 
 if __name__ == "__main__":
+    model_directory = "models/reuters"
     train_and_save_reuters_model()
-    train_model("models/reuters", additional_epochs=5)
-    model_directory = "reuters_model_10"
-    print(reuters.categories())
+    train_model(model_directory, additional_epochs=5)
     example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
-    r =load_model_and_predict(model_directory, example_text)
-    print(r)
\ No newline at end of file
+    load_model_and_predict(model_directory, example_text)

From f6ecb6fd953400057ac2ce844f22d265d018f70d Mon Sep 17 00:00:00 2001
From: Vignesh Skanda
Date: Fri, 4 Oct 2024 20:28:02 +0530
Subject: [PATCH 4/4] Update prompts.py

---
 crawl4ai/prompts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index a55d6fc..0ddf36d 100644
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -1,4 +1,4 @@
-PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
+PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
 {URL}
 
 And here is the cleaned HTML content of that webpage:
@@ -79,7 +79,7 @@
 2. For each block:
    a. Assign it an index based on its order in the content.
    b. Analyze the content and generate ONE semantic tag that describe what the block is about.
-   c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
+   c. Extract the text content, EXACTLY SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.