diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index a55d6fc..0ddf36d 100644
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -1,4 +1,4 @@
-PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
+PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
 {URL}
 
 And here is the cleaned HTML content of that webpage:
@@ -79,7 +79,7 @@
 2. For each block:
    a. Assign it an index based on its order in the content.
    b. Analyze the content and generate ONE semantic tag that describe what the block is about.
-   c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
+   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
 
 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
 
diff --git a/crawl4ai/train.py b/crawl4ai/train.py
index f7e7c1a..f9b67e6 100644
--- a/crawl4ai/train.py
+++ b/crawl4ai/train.py
@@ -7,7 +7,7 @@
 def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     # Extract the TextCategorizer component
-    textcat = nlp.get_pipe("textcat_multilabel")
+    textcat = nlp.get_pipe("textcat")
 
     # Convert the weights to a PyTorch state dictionary
     state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
@@ -24,74 +24,56 @@ def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     print(f"Model weights and vocabulary saved to: {model_dir}")
 
 def extract_vocab(nlp):
-    # Extract vocabulary from the SpaCy model
-    vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
+    vocab = {word: i for i, word in enumerate(nlp.vocab.strings) if word.isalpha()}
     return vocab
 
-nlp = spacy.load("models/reuters")
-save_spacy_model_as_torch(nlp, model_dir="models")
-
 def train_and_save_reuters_model(model_dir="models/reuters"):
-    # Ensure the Reuters corpus is downloaded
     nltk.download('reuters')
     nltk.download('punkt')
     if not reuters.fileids():
         print("Reuters corpus not found.")
         return
 
-    # Load a blank English spaCy model
     nlp = spacy.blank("en")
+    textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False, "architecture": "simple_cnn"})
 
-    # Create a TextCategorizer with the ensemble model for multi-label classification
-    textcat = nlp.add_pipe("textcat_multilabel")
-
-    # Add labels to text classifier
     for label in reuters.categories():
         textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
         text = reuters.raw(fileid)
         cats = {label: label in categories for label in reuters.categories()}
-        # Prepare spacy Example objects
         doc = nlp.make_doc(text)
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the text categorizer with the example objects
     nlp.initialize(lambda: train_examples)
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
-    for i in range(5):  # Adjust iterations for better accuracy
+    for i in range(5):
         random.shuffle(train_examples)
         losses = {}
-        # Create batches of data
         batches = spacy.util.minibatch(train_examples, size=8)
         for batch in batches:
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
 def train_model(model_dir, additional_epochs=0):
-    # Load the model if it exists, otherwise start with a blank model
     try:
         nlp = spacy.load(model_dir)
         print("Model loaded from disk.")
     except IOError:
         print("No existing model found. Starting with a new model.")
         nlp = spacy.blank("en")
-        textcat = nlp.add_pipe("textcat_multilabel")
+        textcat = nlp.add_pipe("textcat", config={"exclusive_classes": False})
         for label in reuters.categories():
             textcat.add_label(label)
 
-    # Prepare training data
     train_examples = []
     for fileid in reuters.fileids():
         categories = reuters.categories(fileid)
@@ -101,15 +83,12 @@ def train_model(model_dir, additional_epochs=0):
         example = Example.from_dict(doc, {'cats': cats})
         train_examples.append(example)
 
-    # Initialize the model if it was newly created
-    if 'textcat_multilabel' not in nlp.pipe_names:
+    if 'textcat' not in nlp.pipe_names:
         nlp.initialize(lambda: train_examples)
     else:
         print("Continuing training with existing model.")
 
-    # Train the model
     random.seed(1)
-    spacy.util.fix_random_seed(1)
     num_epochs = 5 + additional_epochs
     for i in range(num_epochs):
         random.shuffle(train_examples)
@@ -119,28 +98,21 @@ def train_model(model_dir, additional_epochs=0):
             nlp.update(batch, drop=0.2, losses=losses)
         print(f"Losses at iteration {i}: {losses}")
 
-    # Save the trained model
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
-def load_model_and_predict(model_dir, text, tok_k = 3):
-    # Load the trained model from the specified directory
+def load_model_and_predict(model_dir, text, tok_k=3):
     nlp = spacy.load(model_dir)
-
-    # Process the text with the loaded model
     doc = nlp(text)
-
-    # gee top 3 categories
     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
     print(f"Top {tok_k} categories:")
-
-    return top_categories
+    for category, score in top_categories:
+        print(f"{category}: {score:.4f}")
+    return top_categories
 
 if __name__ == "__main__":
+    model_directory = "models/reuters"
     train_and_save_reuters_model()
-    train_model("models/reuters", additional_epochs=5)
-    model_directory = "reuters_model_10"
-    print(reuters.categories())
+    train_model(model_directory, additional_epochs=5)
     example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
-    r =load_model_and_predict(model_directory, example_text)
-    print(r)
\ No newline at end of file
+    load_model_and_predict(model_directory, example_text)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 71a36ae..be65171 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string):
     return parsed_objects, unparsed_segments
 
 def sanitize_html(html):
-    # Replace all weird and special characters with an empty string
+    # Replace all unwanted and special characters with an empty string
     sanitized_html = html
     # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
 
@@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         if tag.name != 'img':
             tag.attrs = {}
 
-    # Extract all img tgas inti [{src: '', alt: ''}]
+    # Extract all img tags into [{src: '', alt: ''}]
     media = {
         'images': [],
         'videos': [],
@@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
             img.decompose()
 
-    # Create a function that replace content of all"pre" tage with its inner text
+    # Create a function that replaces the content of all "pre" tags with their inner text
     def replace_pre_tags_with_text(node):
         for child in node.find_all('pre'):
             # set child inner html to its text
@@ -502,7 +502,7 @@ def find_closest_parent_with_useful_text(tag):
     current_tag = tag
     while current_tag:
         current_tag = current_tag.parent
-        # Get the text content of the parent tag
+        # Get the text content from the parent tag
         if current_tag:
             text_content = current_tag.get_text(separator=' ',strip=True)
             # Check if the text content has at least word_count_threshold
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 7dea56c..26d2d1a 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -1,4 +1,5 @@
 import os, time
+import json
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from pathlib import Path
@@ -234,4 +235,4 @@ def process_html(
         extracted_content=extracted_content,
         success=True,
         error_message="",
-    )
\ No newline at end of file
+    )
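
Usage note (reviewer sketch, not part of the patch): assuming a model has already been trained and written to models/reuters by train_and_save_reuters_model above, the retrained "textcat" pipeline can be queried with standard spaCy calls (spacy.load, doc.cats). The example text below is an illustrative assumption; the top-3 cutoff mirrors the tok_k=3 default in load_model_and_predict.

    # Hypothetical usage sketch (not part of the diff above).
    import spacy

    nlp = spacy.load("models/reuters")  # path used by the training script's __main__ block
    doc = nlp("Crude oil prices rose after OPEC announced production cuts.")  # assumed example text

    # doc.cats maps each Reuters category label to a predicted score.
    for label, score in sorted(doc.cats.items(), key=lambda kv: kv[1], reverse=True)[:3]:
        print(f"{label}: {score:.4f}")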