diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index a55d6fc..0ddf36d 100644
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -1,4 +1,4 @@
-PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
+PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
 {URL}
 
 And here is the cleaned HTML content of that webpage:
@@ -79,7 +79,7 @@
 2. For each block:
    a. Assign it an index based on its order in the content.
    b. Analyze the content and generate ONE semantic tag that describe what the block is about.
-   c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
+   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
 
 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
 
diff --git a/crawl4ai/train.py b/crawl4ai/train.py
index f7e7c1a..f9b67e6 100644
--- a/crawl4ai/train.py
+++ b/crawl4ai/train.py
@@ -7,7 +7,7 @@
 def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     # Extract the TextCategorizer component
-    textcat = nlp.get_pipe("textcat_multilabel")
+    textcat = nlp.get_pipe("textcat")
 
     # Convert the weights to a PyTorch state dictionary
     state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
@@ -24,74 +24,56 @@ def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     print(f"Model weights and vocabulary saved to: {model_dir}")
 
 def extract_vocab(nlp):
-    # Extract vocabulary from the SpaCy model
-    vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
+    vocab = {word: i for i, word in enumerate(nlp.vocab.strings) if word.isalpha()}
     return vocab
 
-nlp = spacy.load("models/reuters")
-save_spacy_model_as_torch(nlp, model_dir="models")
-
 def train_and_save_reuters_model(model_dir="models/reuters"):
-    # Ensure the Reuters corpus is downloaded
     nltk.download('reuters')
     nltk.download('punkt')
     if not reuters.fileids():
         print("Reuters corpus not found.")
         return
 
-    # Load a blank English spaCy model
     nlp = spacy.blank("en")
+    textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False, "architecture": "simple_cnn"})
 
-    # Create a TextCategorizer with the ensemble model for multi-label classification
-    textcat = nlp.add_pipe("textcat_multilabel")
-
-    # Add labels to text classifier
     for label in reuters.categories():
         textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
         text = reuters.raw(fileid)
         cats = {label: label in categories for label in reuters.categories()}
-        # Prepare spacy Example objects
         doc = nlp.make_doc(text)
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the text categorizer with the example objects
     nlp.initialize(lambda: train_examples)
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
-    for i in range(5):  # Adjust iterations for better accuracy
+    for i in range(5):
         random.shuffle(train_examples)
         losses = {}
-        # Create batches of data
         batches = spacy.util.minibatch(train_examples, size=8)
         for batch in batches:
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
 def train_model(model_dir, additional_epochs=0):
-    # Load the model if it exists, otherwise start with a blank model
     try:
         nlp = spacy.load(model_dir)
         print("Model loaded from disk.")
     except IOError:
         print("No existing model found. Starting with a new model.")
         nlp = spacy.blank("en")
-        textcat = nlp.add_pipe("textcat_multilabel")
+        textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False})
         for label in reuters.categories():
             textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
@@ -101,15 +83,12 @@ def train_model(model_dir, additional_epochs=0):
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the model if it was newly created
-    if 'textcat_multilabel' not in nlp.pipe_names:
+    if 'textcat' not in nlp.pipe_names:
         nlp.initialize(lambda: train_examples)
     else:
         print("Continuing training with existing model.")
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
     num_epochs = 5 + additional_epochs
     for i in range(num_epochs):
         random.shuffle(train_examples)
@@ -119,28 +98,21 @@ def train_model(model_dir, additional_epochs=0):
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
-def load_model_and_predict(model_dir, text, tok_k = 3):
-    # Load the trained model from the specified directory
+def load_model_and_predict(model_dir, text, tok_k=3):
     nlp = spacy.load(model_dir)
-
-    # Process the text with the loaded model
     doc = nlp(text)
-
-    # gee top 3 categories
     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
     print(f"Top {tok_k} categories:")
-
-    return top_categories
+    for category, score in top_categories:
+        print(f"{category}: {score:.4f}")
+    return top_categories
 
 if __name__ == "__main__":
+    model_directory = "models/reuters"
     train_and_save_reuters_model()
-    train_model("models/reuters", additional_epochs=5)
-    model_directory = "reuters_model_10"
-    print(reuters.categories())
+    train_model(model_directory, additional_epochs=5)
     example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
-    r =load_model_and_predict(model_directory, example_text)
-    print(r)
\ No newline at end of file
+    load_model_and_predict(model_directory, example_text)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 71a36ae..be65171 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string):
     return parsed_objects, unparsed_segments
 
 def sanitize_html(html):
-    # Replace all weird and special characters with an empty string
+    # Replace all unwanted and special characters with an empty string
     sanitized_html = html
     # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
 
@@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         if tag.name != 'img':
             tag.attrs = {}
 
-    # Extract all img tgas inti [{src: '', alt: ''}]
+    # Extract all img tags into [{src: '', alt: ''}]
     media = {
         'images': [],
         'videos': [],
@@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
             img.decompose()
 
-    # Create a function that replace content of all"pre" tage with its inner text
+    # Create a function that replaces the content of all "pre" tags with their inner text
     def replace_pre_tags_with_text(node):
         for child in node.find_all('pre'):
             # set child inner html to its text
@@ -502,7 +502,7 @@ def find_closest_parent_with_useful_text(tag):
     current_tag = tag
     while current_tag:
         current_tag = current_tag.parent
-        # Get the text content of the parent tag
+        # Get the text content from the parent tag
         if current_tag:
             text_content = current_tag.get_text(separator=' ',strip=True)
             # Check if the text content has at least word_count_threshold
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 7dea56c..26d2d1a 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -1,4 +1,5 @@
 import os, time
+import json
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from pathlib import Path
@@ -234,4 +235,4 @@ def process_html(
         extracted_content=extracted_content,
         success=True,
         error_message="",
-    )
\ No newline at end of file
+    )
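
Usage note (reviewer sketch, not part of the patch): assuming a model has already been trained and written to models/reuters by train_and_save_reuters_model above, the retrained "textcat" pipeline can be queried with standard spaCy calls (spacy.load, doc.cats). The example text below is an illustrative assumption; the top-3 cutoff mirrors the tok_k=3 default in load_model_and_predict.

    # Hypothetical usage sketch (not part of the diff above).
    import spacy

    nlp = spacy.load("models/reuters")  # path used by the training script's __main__ block
    doc = nlp("Crude oil prices rose after OPEC announced production cuts.")  # assumed example text

    # doc.cats maps each Reuters category label to a predicted score.
    for label, score in sorted(doc.cats.items(), key=lambda kv: kv[1], reverse=True)[:3]:
        print(f"{label}: {score:.4f}")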