diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..7737147
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitattributes b/.gitattributes
index dfe0770..4d77b01 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,4 @@
-# Auto detect text files and perform LF normalization
-* text=auto
+*.java linguist-detectable=true
+*.js linguist-detectable=false
+*.html linguist-detectable=false
+*.xml linguist-detectable=false
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d91b011
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+
+.java-version
+/target
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/compiler.xml b/.idea/compiler.xml
new file mode 100644
index 0000000..47da706
--- /dev/null
+++ b/.idea/compiler.xml
@@ -0,0 +1,22 @@
+<!-- IntelliJ IDEA compiler settings; XML content not preserved -->
\ No newline at end of file
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 0000000..8a81040
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,7 @@
+<!-- IntelliJ IDEA file-encoding settings; XML content not preserved -->
\ No newline at end of file
diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml
new file mode 100644
index 0000000..3d41add
--- /dev/null
+++ b/.idea/jarRepositories.xml
@@ -0,0 +1,45 @@
+<!-- IntelliJ IDEA remote JAR repositories; XML content not preserved -->
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..bff2946
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,15 @@
+<!-- IntelliJ IDEA project settings (misc); XML content not preserved -->
\ No newline at end of file
diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml
new file mode 100644
index 0000000..2b63946
--- /dev/null
+++ b/.idea/uiDesigner.xml
@@ -0,0 +1,124 @@
+<!-- IntelliJ IDEA UI Designer palette settings; XML content not preserved -->
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<!-- IntelliJ IDEA VCS mapping; XML content not preserved -->
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 4343728..633b11b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,10 +5,10 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.mule.mulechain</groupId>
-    <artifactId>mulechain-web-crawler</artifactId>
-    <version>0.0.0</version>
+    <artifactId>mac-web-crawler</artifactId>
+    <version>0.1.0</version>
     <packaging>mule-extension</packaging>
-    <name>Mulechain-web-crawler Extension</name>
+    <name>mac-web-crawler Extension</name>
 
     <parent>
         <groupId>org.mule.extensions</groupId>
@@ -20,7 +20,13 @@
             <groupId>org.jsoup</groupId>
             <artifactId>jsoup</artifactId>
             <version>1.17.2</version>
-        </dependency>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>2.15.2</version>
+        </dependency>
+
             <id>anypoint-exchange-v3</id>
diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000..5e305c0
Binary files /dev/null and b/src/.DS_Store differ
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.java b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.java
index ea86d02..a2cf973 100644
--- a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.java
+++ b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.java
@@ -1,7 +1,11 @@
package com.mule.mulechain.crawler.internal;
import org.mule.runtime.extension.api.annotation.Operations;
+import org.mule.runtime.extension.api.annotation.param.Optional;
import org.mule.runtime.extension.api.annotation.param.Parameter;
+import org.mule.runtime.extension.api.annotation.param.display.DisplayName;
+
+import java.util.List;
/**
* This class represents an extension configuration, values set in this class are commonly used across multiple
@@ -11,9 +15,16 @@
public class MulechainwebcrawlerConfiguration {
@Parameter
- private String configId;
+ @Optional
+ @DisplayName("Tag List")
+  private List<String> tags;
+
+ // Getters and Setters
+  public List<String> getTags() {
+ return this.tags;
+ }
- public String getConfigId(){
- return configId;
+  public void setTags(List<String> tags) {
+ this.tags = tags;
}
}
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.java b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.java
index f10f22b..41574e7 100644
--- a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.java
+++ b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.java
@@ -9,8 +9,8 @@
* This is the main class of an extension, is the entry point from which configurations, connection providers, operations
* and sources are going to be declared.
*/
-@Xml(prefix = "mulechain-web-crawler")
-@Extension(name = "Mulechain Crawler")
+@Xml(prefix = "mac-web-crawler")
+@Extension(name = "MAC WebCrawler")
@Configurations(MulechainwebcrawlerConfiguration.class)
public class MulechainwebcrawlerExtension {
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.java b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.java
index 330f4cd..26f9c00 100644
--- a/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.java
+++ b/src/main/java/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.java
@@ -1,29 +1,415 @@
package com.mule.mulechain.crawler.internal;
-import static org.mule.runtime.extension.api.annotation.param.MediaType.ANY;
-
+import com.mule.mulechain.crawler.internal.helpers.CrawlResult;
+import com.mule.mulechain.crawler.internal.helpers.SiteMapNode;
+import com.mule.mulechain.crawler.internal.helpers.crawlingHelper;
+import org.jsoup.UnsupportedMimeTypeException;
+import org.jsoup.nodes.Document;
import org.mule.runtime.extension.api.annotation.Alias;
-import org.mule.runtime.extension.api.annotation.param.MediaType;
import org.mule.runtime.extension.api.annotation.param.Config;
-import java.io.IOException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
+import org.mule.runtime.extension.api.annotation.param.MediaType;
+import org.mule.runtime.extension.api.annotation.param.display.DisplayName;
+import org.mule.runtime.extension.api.annotation.param.display.Example;
+import org.mule.runtime.extension.api.annotation.param.display.Placement;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+import static org.mule.runtime.extension.api.annotation.param.MediaType.ANY;
/**
* This class is a container for operations, every public method in this class will be taken as an extension operation.
*/
public class MulechainwebcrawlerOperations {
+ private enum CrawlType {
+ CONTENT,
+ LINK
+ }
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(MulechainwebcrawlerOperations.class);
+
+ /**
+ * Crawl a website at a specified depth and fetch contents. Specify tags and classes in the configuration to fetch contents from those elements only.
+ *
+ * @throws IOException
+ */
+
+  /* JSoup limitations / web crawl challenges:
+     - some sites block robots; sending a browser-like User-Agent may be required, but is not guaranteed to work
+     - JavaScript-generated content is not rendered by jsoup
+     - some sites require cookies or sessions to be present
+  */
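+  /* One possible mitigation (shown commented out in crawlingHelper.getDocument) is to send a
+     browser-like User-Agent and referrer with the Jsoup request, for example:
+
+       Document doc = Jsoup.connect(url)
+               .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+               .referrer("http://www.google.com")
+               .get();
+
+     This is only a sketch; sites that block crawlers may still refuse the request. */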
+ @MediaType(value = ANY, strict = false)
+ @Alias("Crawl-website")
+ public String crawlWebsite(@Config MulechainwebcrawlerConfiguration configuration,
+ @DisplayName("Website URL") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url,
+ @DisplayName("Maximum Depth") @Placement(order = 2) @Example("2") int maxDepth,
+ @DisplayName("Retrieve Meta Tags") @Placement(order = 3) @Example("Yes") boolean getMetaTags,
+ @DisplayName("Download Images") @Placement(order = 4) @Example("Yes") boolean downloadImages,
+ @DisplayName("Download Location") @Placement(order = 5) @Example("/users/mulesoft/downloads") String downloadPath) throws IOException {
+ LOGGER.info("Website crawl action");
+
+
+ // initialise variables
+    Set<String> visitedLinksGlobal = new HashSet<>();
+    Map<Integer, Set<String>> visitedLinksByDepth = new HashMap<>();
+    List<String> specificTags = configuration.getTags();
+
+ SiteMapNode root = startCrawling(url, 0, maxDepth, visitedLinksByDepth, visitedLinksGlobal, downloadImages, downloadPath, specificTags, getMetaTags, CrawlType.CONTENT);
+
+
+ return crawlingHelper.convertToJSON(root);
+ }
+
+
+ /**
+ * Fetch the meta tags from a web page.
+ */
+ @MediaType(value = ANY, strict = false)
+ @Alias("Get-page-meta-tags")
+ public String getMetaTags (
+ @DisplayName("Page URL") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url) throws IOException {
+ LOGGER.info("Get meta tags");
+
+ Document document = crawlingHelper.getDocument(url);
+
+ return crawlingHelper.convertToJSON(crawlingHelper.getPageMetaTags(document));
+ }
+
/**
- * Example of an operation that uses the configuration and a connection instance to perform some action.
- * @throws IOException
+ * Retrieve internal links as a site map from the specified url and depth.
*/
-
@MediaType(value = ANY, strict = false)
- @Alias("Crawl-website")
- public String crawlWebsite(String url, @Config MulechainwebcrawlerConfiguration configuration) throws IOException{
+ @Alias("Generate-sitemap")
+ public String getSiteMap (
+ @DisplayName("Website URL") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url,
+ @DisplayName("Maximum Depth") @Placement(order = 2) @Example("2") int maxDepth) throws IOException {
+ LOGGER.info("Generate sitemap");
+
+ // initialise variables
+    Set<String> visitedLinksGlobal = new HashSet<>();
+    Map<Integer, Set<String>> visitedLinksByDepth = new HashMap<>();
- return "";
+ SiteMapNode root = startCrawling(url, 0, maxDepth, visitedLinksByDepth, visitedLinksGlobal, false, null, null, false, CrawlType.LINK);
+
+ return crawlingHelper.convertToJSON(root);
}
+ /**
+ * Download all images from a web page, or download a single image at the specified link.
+ */
+ @MediaType(value = ANY, strict = false)
+ @Alias("Download-image")
+ public String downloadWebsiteImages (
+ @DisplayName("Page Or Image URL") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url,
+ @DisplayName("Download Location") @Placement(order = 2) @Example("/users/mulesoft/downloads") String downloadPath) throws IOException {
+
+ String result = "";
+
+ try {
+ // url provided is a website url, so download all images from this document
+ Document document = crawlingHelper.getDocument(url);
+ result = crawlingHelper.convertToJSON(downloadWebsiteImages(document, downloadPath));
+ }
+ catch (UnsupportedMimeTypeException e) {
+ // url provided is direct link to image, so download single image
+
+      Map<String, String> linkFileMap = new HashMap<>();
+ linkFileMap.put(url, downloadSingleImage(url, downloadPath));
+ result = crawlingHelper.convertToJSON(linkFileMap);
+ }
+ return result;
+ }
+
+
+ /**
+ * Get insights from a web page including links, word count, number of occurrences of elements. Restrict insights to specific elements in the configuration.
+ */
+ @MediaType(value = ANY, strict = false)
+ @Alias("Get-page-insights")
+ public String getPageInsights(
+ @Config MulechainwebcrawlerConfiguration configuration,
+ @DisplayName("Page Url") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url) throws IOException {
+ LOGGER.info("Analyze page");
+
+ Document document = crawlingHelper.getDocument(url);
+
+ return crawlingHelper.convertToJSON(crawlingHelper.getPageInsights(document, configuration.getTags(), crawlingHelper.PageInsightType.ALL));
+ }
+
+
+ /**
+ * Get contents of a web page. Content is returned in the resulting payload.
+ */
+ @MediaType(value = ANY, strict = false)
+ @Alias("Get-page-content")
+ public String getPageContent(
+ @Config MulechainwebcrawlerConfiguration configuration,
+ @DisplayName("Page Url") @Placement(order = 1) @Example("https://mac-project.ai/docs") String url) throws IOException {
+ LOGGER.info("Get page content");
+
+    Map<String, String> contents = new HashMap<>();
+
+ Document document = crawlingHelper.getDocument(url);
+
+ contents.put("url", document.baseUri());
+ contents.put("title", document.title());
+ contents.put("content", crawlingHelper.getPageContent(document, configuration.getTags()));
+
+ return crawlingHelper.convertToJSON(contents);
+ }
+
+
+ private String savePageContents(Object results, String downloadPath, String title) throws IOException {
+
+ String pageContents = crawlingHelper.convertToJSON(results);
+
+ String fileName = "";
+
+ // Generate a unique filename using the current timestamp
+ String timestamp = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date());
+
+
+ // Create a unique filename based on the sanitized title
+ fileName = crawlingHelper.getSanitizedFilename(title) + "_" + timestamp + ".json";
+
+    // Write the JSON content to a file, ensuring the output directory exists
+    File file = new File(downloadPath, fileName);
+    file.getParentFile().mkdirs();
+
+ try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
+ // Write content to the file
+ writer.write(pageContents);
+ LOGGER.info("Saved content to file: " + fileName);
+ } catch (IOException e) {
+ LOGGER.error("An error occurred while writing to the file: " + e.getMessage());
+ }
+
+ return (file != null) ? file.getName() : "File is null";
+ }
+
+
+  private SiteMapNode startCrawling(String url, int depth, int maxDepth, Map<Integer, Set<String>> visitedLinksByDepth, Set<String> visitedLinksGlobal, boolean downloadImages, String downloadPath, List<String> contentTags, boolean getMetaTags, CrawlType crawlType) {
+
+ // return if maxDepth reached
+ if (depth > maxDepth) {
+ return null;
+ }
+
+ // Initialize the set for the current depth if not already present
+ visitedLinksByDepth.putIfAbsent(depth, new HashSet<>());
+
+ // Check if this URL has already been visited at this depth
+ if (visitedLinksByDepth.get(depth).contains(url)) {
+ return null;
+ }
+
+ // crawl & extract current page
+ try {
+
+ // Mark the URL as visited for this depth
+ visitedLinksByDepth.get(depth).add(url);
+
+ SiteMapNode node = null;
+
+ // get page as a html document
+ Document document = crawlingHelper.getDocument(url);
+
+
+      // Check whether this url's contents have already been fetched at any depth (global de-duplication).
+      // This check is skipped for CrawlType.LINK: a link only needs to be unique within its own depth, not globally.
+ if (!visitedLinksGlobal.contains(url) && crawlType == CrawlType.CONTENT) {
+
+ // add url to urlContentFetched to indicate content has been fetched.
+ visitedLinksGlobal.add(url);
+
+ // Create Map to hold all data for the current page - this will be serialized to JSON and saved to file
+        Map<String, Object> pageData = new HashMap<>();
+
+
+ LOGGER.info("Fetching content for : " + url);
+
+ String title = document.title();
+
+ pageData.put("url", url);
+ pageData.put("title", title);
+
+
+ // check if need to download images in the current page
+ if (downloadImages) {
+ LOGGER.info("Downloading images for : " + url);
+ pageData.put("imageFiles", downloadWebsiteImages(document, downloadPath));
+ }
+
+
+ // get all meta tags from the document
+ if (getMetaTags) {
+ // Iterating over each entry in the map
+          for (Map.Entry<String, String> entry : crawlingHelper.getPageMetaTags(document).entrySet()) {
+ pageData.put(entry.getKey(), entry.getValue());
+ }
+ }
+
+
+ // get page contents
+ pageData.put("content", crawlingHelper.getPageContent(document, contentTags));
+
+
+ // save gathered data of page to file
+ String filename = savePageContents(pageData, downloadPath, title);
+
+
+ // Create a new node for this URL
+ node = new CrawlResult(url, filename);
+
+ }
+ else if (crawlType == CrawlType.LINK) {
+ node = new SiteMapNode(url);
+ LOGGER.info("Found url : " + url);
+ }
+ else {
+ // content previously downloaded, so setting file name as such
+ node = new CrawlResult(url, "Duplicate.");
+ }
+
+
+ // If not at max depth, find and crawl the links on the page
+ if (depth <= maxDepth) {
+ // get all links on the current page
+        Set<String> links = new HashSet<>();
+
+        Map<String, Set<String>> linksMap = (Map<String, Set<String>>) crawlingHelper.getPageInsights(document, null, crawlingHelper.PageInsightType.INTERNALLINKS).get("links");
+        if (linksMap != null) {
+          links = linksMap.get("internal");
+ }
+
+ if (links != null) {
+ for (String nextUrl : links) {
+
+ // Recursively crawl the link and add as a child
+ SiteMapNode childNode = startCrawling(nextUrl, depth + 1, maxDepth, visitedLinksByDepth, visitedLinksGlobal, downloadImages, downloadPath, contentTags, getMetaTags, crawlType);
+ if (childNode != null) {
+ node.addChild(childNode);
+ }
+ }
+ }
+ }
+ return node;
+ } catch (Exception e) {
+ LOGGER.error(e.toString());
+ }
+ return null;
+ }
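+
+  // Note: startCrawling returns null when maxDepth is exceeded, when a URL was already visited at this
+  // depth, or when fetching fails; callers simply skip adding a child node in those cases.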
+
+  private Map<String, String> downloadWebsiteImages(Document document, String saveDirectory) throws IOException {
+    // Set to store image URLs found on the page
+    Set<String> imageUrls = new HashSet<>();
+
+    Map<String, String> linkFileMap = new HashMap<>();
+
+    Map<String, Set<String>> linksMap = (Map<String, Set<String>>) crawlingHelper.getPageInsights(document, null, crawlingHelper.PageInsightType.IMAGELINKS).get("links");
+    if (linksMap != null) {
+      imageUrls = linksMap.get("images");
+ }
+
+ if (imageUrls != null) {
+
+ // Save all images found on the page
+ LOGGER.info("Number of img[src] elements found : " + imageUrls.size());
+ for (String imageUrl : imageUrls) {
+ linkFileMap.put(imageUrl, downloadSingleImage(imageUrl, saveDirectory));
+ }
+ }
+ return linkFileMap;
+ }
+
+ private String downloadSingleImage(String imageUrl, String saveDirectory) throws IOException{
+ LOGGER.info("Found image : " + imageUrl);
+ File file;
+ try {
+ // Check if the URL is a Data URL
+ if (imageUrl.startsWith("data:image/")) {
+ // Extract base64 data from the Data URL
+ String base64Data = imageUrl.substring(imageUrl.indexOf(",") + 1);
+
+ if (base64Data.isEmpty()) {
+ LOGGER.info("Base64 data is empty for URL: " + imageUrl);
+ return "";
+ }
+
+ // Decode the base64 data
+ byte[] imageBytes;
+
+ try {
+ imageBytes = Base64.getDecoder().decode(base64Data);
+ } catch (IllegalArgumentException e) {
+ LOGGER.info("Error decoding base64 data: " + e.getMessage());
+ return "";
+ }
+
+ if (imageBytes.length == 0) {
+ LOGGER.info("Decoded image bytes are empty for URL: " + imageUrl);
+ return "";
+ }
+
+ // Determine the file extension from the Data URL
+ String fileType = imageUrl.substring(5, imageUrl.indexOf(";"));
+ String fileExtension = fileType.split("/")[1];
+
+ // Generate a unique filename using the current timestamp
+ String timestamp = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date());
+ String fileName = "image_" + timestamp + "." + fileExtension;
+ file = new File(saveDirectory, fileName);
+
+ // Ensure the directory exists
+ file.getParentFile().mkdirs();
+
+ // Write the decoded bytes to the file
+ try (FileOutputStream out = new FileOutputStream(file)) {
+ out.write(imageBytes);
+ LOGGER.info("DataImage saved: " + file.getAbsolutePath());
+ }
+ } else {
+ // Handle standard image URLs
+ URL url = new URL(imageUrl);
+
+ // Extract the 'url' parameter from the query string
+ String decodedUrl = crawlingHelper.extractAndDecodeUrl(imageUrl);
+ // Extract the filename from the decoded URL
+ String fileName = crawlingHelper.extractFileNameFromUrl(decodedUrl);
+
+ //String fileName = decodedUrl.substring(imageUrl.lastIndexOf("/") + 1);
+ file = new File(saveDirectory, fileName);
+
+ // Ensure the directory exists
+ file.getParentFile().mkdirs();
+
+ // Download and save the image
+ try (InputStream in = url.openStream();
+ FileOutputStream out = new FileOutputStream(file)) {
+
+ byte[] buffer = new byte[1024];
+ int bytesRead;
+ while ((bytesRead = in.read(buffer)) != -1) {
+ out.write(buffer, 0, bytesRead);
+ }
+ }
+ LOGGER.info("Image saved: " + file.getAbsolutePath());
+
+ }
+ } catch (IOException e) {
+ LOGGER.error("Error saving image: " + imageUrl);
+ throw e;
+ }
+
+ return (file != null) ? file.getName() : "File is null";
+ }
}
+
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/helpers/CrawlResult.java b/src/main/java/com/mule/mulechain/crawler/internal/helpers/CrawlResult.java
new file mode 100644
index 0000000..1daeda4
--- /dev/null
+++ b/src/main/java/com/mule/mulechain/crawler/internal/helpers/CrawlResult.java
@@ -0,0 +1,15 @@
+package com.mule.mulechain.crawler.internal.helpers;
+
+public class CrawlResult extends SiteMapNode {
+ private String fileName;
+
+ public CrawlResult(String url, String fileName) {
+ super(url);
+ this.fileName = fileName;
+
+ }
+
+ public String getFileName() {
+ return fileName;
+ }
+}
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.java b/src/main/java/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.java
new file mode 100644
index 0000000..5ee2c90
--- /dev/null
+++ b/src/main/java/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.java
@@ -0,0 +1,27 @@
+package com.mule.mulechain.crawler.internal.helpers;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class SiteMapNode {
+ private String url;
+ private List children;
+
+ public SiteMapNode(String url) {
+ this.url = url;
+ this.children = new ArrayList<>();
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public List getChildren() {
+ return children;
+ }
+
+ public void addChild(SiteMapNode child) {
+ this.children.add(child);
+ }
+}
+
diff --git a/src/main/java/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.java b/src/main/java/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.java
index 7f9715d..d7dfb4b 100644
--- a/src/main/java/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.java
+++ b/src/main/java/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.java
@@ -1,19 +1,269 @@
package com.mule.mulechain.crawler.internal.helpers;
-import java.io.IOException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
public class crawlingHelper {
-
- private static String getTitle(String url, String outputFolder) throws IOException{
- Document doc = connectUrlGetDocument(url);
- String title = doc.title();
- //System.out.println("title is: " + title);
- return title;
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(crawlingHelper.class);
+
+ public enum PageInsightType {
+ ALL,
+ INTERNALLINKS,
+ EXTERNALLINKS,
+ REFERENCELINKS,
+ IMAGELINKS,
+ ELEMENTCOUNTSTATS
+ }
+
+
+ public static Document getDocument(String url) throws IOException {
+ // use jsoup to fetch the current page elements
+ Document document = Jsoup.connect(url)
+ //.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+ //.referrer("http://www.google.com") // to prevent "HTTP error fetching URL. Status=403" error
+ .get();
+
+ return document;
+ }
+
+
+ public static String extractFileNameFromUrl(String url) {
+ // Extract the filename from the URL path
+ String fileName = url.substring(url.lastIndexOf("/") + 1, url.indexOf('?') > 0 ? url.indexOf('?') : url.length());
+
+ // if no extension for image found, then use .jpg as default
+ return fileName.contains(".") ? fileName : fileName + ".jpg";
+ }
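+
+  // extractFileNameFromUrl examples (illustrative URLs):
+  //   "https://example.com/img/logo.png?w=1024" -> "logo.png"
+  //   "https://example.com/img/logo"            -> "logo.jpg" (default extension)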
+
+ /*
+ "https://wp.salesforce.com/en-ap/wp-content/uploads/sites/14/2024/02/php-marquee-starter-lg-bg.jpg?w=1024",
+ "https://example.com/image?url=%2F_next%2Fstatic%2Fmedia%2Fcard-1.8b03e519.png&w=3840&q=75"
+ */
+ public static String extractAndDecodeUrl(String fullUrl) throws UnsupportedEncodingException, MalformedURLException {
+
+ URL url = new URL(fullUrl);
+ String query = url.getQuery(); // Extract the query string from the URL
+
+ if (query != null) {
+ // Extract and decode the 'url' parameter from the query string
+ String[] params = query.split("&");
+ for (String param : params) {
+ String[] pair = param.split("=");
+ if (pair.length == 2 && "url".equals(pair[0])) {
+ return URLDecoder.decode(pair[1], StandardCharsets.UTF_8.name());
+ }
+ }
+ // If 'url' parameter not found, return the URL without changes
+ return fullUrl;
+ } else {
+ // If there's no query string, return the URL as is
+ return fullUrl;
+ }
}
- private static Document connectUrlGetDocument(String url) throws IOException {
- return Jsoup.connect(url).get();
+
+ public static String convertToJSON(Object contentToSerialize) throws JsonProcessingException{
+ // Convert the result to JSON
+ ObjectMapper mapper = new ObjectMapper();
+ //return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(contentToSerialize);
+ return mapper.writeValueAsString(contentToSerialize);
+ }
+
+
+
+  public static Map<String, String> getPageMetaTags(Document document) {
+    // Map to store meta tag data
+    Map<String, String> metaTagData = new HashMap<>();
+
+ // Select all meta tags
+ Elements metaTags = document.select("meta");
+
+ // Iterate through each meta tag
+ for (Element metaTag : metaTags) {
+ // Extract the 'name' or 'property' attribute and 'content' attribute
+ String name = metaTag.attr("name");
+ if (name.isEmpty()) {
+ // If 'name' is not present, check for 'property' (e.g., Open Graph meta tags)
+ name = metaTag.attr("property");
+ }
+ String content = metaTag.attr("content");
+
+ // Only add to map if 'name' or 'property' and 'content' are present
+ if (!name.isEmpty() && !content.isEmpty()) {
+ metaTagData.put(name, content);
+ }
+ }
+ return metaTagData;
+ }
+
+  public static Map<String, Object> getPageInsights(Document document, List<String> tags, PageInsightType insight) throws MalformedURLException {
+    // Map to store page analysis
+    Map<String, Object> pageInsightData = new HashMap<>();
+
+
+    // link sets
+    Set<String> internalLinks = new HashSet<>();
+    Set<String> externalLinks = new HashSet<>();
+    Set<String> referenceLinks = new HashSet<>();
+
+    // image-links set
+    Set<String> imageLinks = new HashSet<>();
+
+    // all links map
+    Map<String, Set<String>> linksMap = new HashMap<>();
+
+    // Map to store the element counts
+    Map<String, Integer> elementCounts = new HashMap<>();
+
+
+ String baseUrl = document.baseUri();
+
+ if (insight == PageInsightType.ALL || insight == PageInsightType.INTERNALLINKS || insight == PageInsightType.REFERENCELINKS || insight == PageInsightType.EXTERNALLINKS) {
+ // Select all anchor tags with href attributes
+ Elements links = document.select("a[href]");
+ for (Element link : links) {
+ String href = link.absUrl("href"); // get absolute URLs
+ if (isExternalLink(baseUrl, href)) {
+ externalLinks.add(href);
+ } else if (isReferenceLink(baseUrl, href)) {
+ referenceLinks.add(href);
+ } else {
+ internalLinks.add(href);
+ }
+ }
+
+ if (insight == PageInsightType.ALL || insight == PageInsightType.INTERNALLINKS)
+ linksMap.put("internal", internalLinks);
+ if (insight == PageInsightType.ALL || insight == PageInsightType.EXTERNALLINKS)
+ linksMap.put("external", externalLinks);
+ if (insight == PageInsightType.ALL || insight == PageInsightType.REFERENCELINKS)
+ linksMap.put("reference", referenceLinks);
+ }
+
+
+ if (insight == PageInsightType.ALL || insight == PageInsightType.IMAGELINKS) {
+ // images
+
+ Elements images = document.select("img[src]");
+ for (Element img : images) {
+ String imageUrl = img.absUrl("src");
+ imageLinks.add(imageUrl);
+ }
+
+ linksMap.put("images", imageLinks);
+
+ }
+
+ if (insight == PageInsightType.ALL || insight == PageInsightType.ELEMENTCOUNTSTATS) {
+ String[] elementsToCount = {"div", "p", "h1", "h2", "h3", "h4", "h5"}; // default list of elements to retrieve stats for. Used if no specific tags provided
+
+ if (tags != null && !tags.isEmpty()) {
+ elementsToCount = tags.toArray(new String[tags.size()]);
+ }
+
+ // Loop through each element type and count its occurrences
+ for (String tag : elementsToCount) {
+ Elements elements = document.select(tag);
+ elementCounts.put(tag, elements.size());
+ }
+
+ elementCounts.put("internal", internalLinks.size());
+ elementCounts.put("external", externalLinks.size());
+ elementCounts.put("reference", referenceLinks.size());
+ elementCounts.put("images", imageLinks.size());
+ elementCounts.put("wordCount", countWords(getPageContent(document,tags)));
+
+ pageInsightData.put("pageStats", elementCounts);
+ }
+
+ pageInsightData.put("url", document.baseUri());
+ pageInsightData.put("title", document.title());
+
+ // only add links if any of the types in condition has been requested
+ if (insight == PageInsightType.ALL || insight == PageInsightType.INTERNALLINKS || insight == PageInsightType.REFERENCELINKS || insight == PageInsightType.EXTERNALLINKS || insight == PageInsightType.IMAGELINKS)
+ pageInsightData.put("links", linksMap);
+
+ return pageInsightData;
+ }
+
+  public static String getPageContent(Document document, List<String> tags) {
+
+ StringBuilder collectedText = new StringBuilder();
+
+ // check if crawl should only iterate over specified tags and extract contents from these tags only
+ if (tags != null && !tags.isEmpty()) {
+ for (String selector : tags) {
+ Elements elements = document.select(selector);
+ for (Element element : elements) {
+ collectedText.append(element.text()).append(" ");
+ }
+ }
+ }
+ else {
+ // Extract the text content of the page and add it to the collected text
+ String textContent = document.text();
+ collectedText.append(textContent);
+ }
+
+ return collectedText.toString().trim();
+ }
+
+ // Method to count words in a given text
+ private static int countWords(String text) {
+ if (text == null || text.trim().isEmpty()) {
+ return 0;
+ }
+ // Split the text by whitespace and count the words
+ String[] words = text.trim().split("\\s+");
+ return words.length;
+ }
+
+
+ public static String getSanitizedFilename(String title) {
+ // Replace invalid characters with underscores
+ return title.replaceAll("[\\\\/:*?\"<>|]", "_").replaceAll(" ", "");
+ }
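+
+  // getSanitizedFilename example (illustrative): "My Page: Intro?" -> "MyPage_Intro_"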
+
+ // Method to determine if a link is a reference link to the same page
+ // baseUrl: "https://docs.mulesoft.com/cloudhub-2/ch2-architecture"
+ // linkToCheck: "https://docs.mulesoft.com/cloudhub-2/ch2-architecture#cluster-nodes"
+ // If current page has a reference link to another page, this link will not be considered as a reference link
+ private static boolean isReferenceLink(String baseUrl, String linkToCheck) {
+ try {
+ URI baseUri = new URI(baseUrl);
+ URI linkUri = new URI(linkToCheck);
+
+ // Check if the scheme, host, and path are the same, and the link has a fragment
+ return baseUri.getScheme().equals(linkUri.getScheme()) &&
+ baseUri.getHost().equals(linkUri.getHost()) &&
+ baseUri.getPath().equals(linkUri.getPath()) &&
+ linkUri.getFragment() != null;
+
+ } catch (URISyntaxException e) {
+ LOGGER.error(e.toString());
+ return false;
+ }
+ }
+
+ private static boolean isExternalLink(String baseUrl, String linkToCheck) throws MalformedURLException {
+ // Extract the base domain from the base URI
+ URL parsedUrl = new URL(baseUrl);
+ String baseDomain = parsedUrl.getHost();
+
+ return !linkToCheck.contains(baseDomain);
+
}
}
diff --git a/target/classes/META-INF/mac-webcrawler-extension-descriptions.xml b/target/classes/META-INF/mac-webcrawler-extension-descriptions.xml
new file mode 100644
index 0000000..07f1c86
--- /dev/null
+++ b/target/classes/META-INF/mac-webcrawler-extension-descriptions.xml
@@ -0,0 +1,145 @@
+<!-- Generated extension operation/parameter descriptions; XML content not preserved -->
diff --git a/target/classes/META-INF/mule-artifact/mule-artifact.json b/target/classes/META-INF/mule-artifact/mule-artifact.json
new file mode 100644
index 0000000..930122c
--- /dev/null
+++ b/target/classes/META-INF/mule-artifact/mule-artifact.json
@@ -0,0 +1,25 @@
+{
+ "extensionModelLoaderDescriptor": {
+ "id": "java",
+ "attributes": {
+ "type": "com.mule.mulechain.crawler.internal.MulechainwebcrawlerExtension",
+ "version": "0.1.0"
+ }
+ },
+ "name": "MAC WebCrawler",
+ "requiredProduct": "MULE",
+ "classLoaderModelLoaderDescriptor": {
+ "id": "mule",
+ "attributes": {
+ "privilegedExportedPackages": [],
+ "privilegedArtifactIds": [],
+ "exportedPackages": [],
+ "exportedResources": []
+ }
+ },
+ "bundleDescriptorLoader": {
+ "id": "mule",
+ "attributes": {}
+ },
+ "minMuleVersion": "4.1.1"
+}
\ No newline at end of file
diff --git a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.class b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.class
index 93ff773..ceddfc2 100644
Binary files a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.class and b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerConfiguration.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.class b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.class
index 673bdc6..1fa2fb1 100644
Binary files a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.class and b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerExtension.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations$CrawlType.class b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations$CrawlType.class
new file mode 100644
index 0000000..fdf18a3
Binary files /dev/null and b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations$CrawlType.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.class b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.class
index c4fc5b2..6b060cb 100644
Binary files a/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.class and b/target/classes/com/mule/mulechain/crawler/internal/MulechainwebcrawlerOperations.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/helpers/CrawlResult.class b/target/classes/com/mule/mulechain/crawler/internal/helpers/CrawlResult.class
new file mode 100644
index 0000000..8971ebf
Binary files /dev/null and b/target/classes/com/mule/mulechain/crawler/internal/helpers/CrawlResult.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.class b/target/classes/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.class
new file mode 100644
index 0000000..3ab1f8c
Binary files /dev/null and b/target/classes/com/mule/mulechain/crawler/internal/helpers/SiteMapNode.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper$PageInsightType.class b/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper$PageInsightType.class
new file mode 100644
index 0000000..7b26ff9
Binary files /dev/null and b/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper$PageInsightType.class differ
diff --git a/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.class b/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.class
index 873bcf7..350bcd2 100644
Binary files a/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.class and b/target/classes/com/mule/mulechain/crawler/internal/helpers/crawlingHelper.class differ
diff --git a/target/docs/mac-webcrawler-documentation.adoc b/target/docs/mac-webcrawler-documentation.adoc
new file mode 100644
index 0000000..93812e9
--- /dev/null
+++ b/target/docs/mac-webcrawler-documentation.adoc
@@ -0,0 +1,241 @@
+:toc: left
+:toc-title: MAC WebCrawler Module
+:toclevels: 2
+:last-update-label!:
+:docinfo:
+:source-highlighter: coderay
+:icons: font
+
+
+= MAC WebCrawler Module Documentation Reference
+
++++
+This is the main class of an extension; it is the entry point from which configurations, connection providers, operations and sources are declared.
++++
+
+
+== Configurations
+---
+[[config]]
+=== Config
+
++++
+Default configuration
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+|Name | String | The name for this configuration. Connectors reference the configuration with this name. | | *x*{nbsp}
+| Tag List a| Array of String | | | {nbsp}
+| Expiration Policy a| <<ExpirationPolicy>> | +++Configures the minimum amount of time that a dynamic configuration instance can remain idle before the runtime considers it eligible for expiration. This does not mean that the platform will expire the instance at the exact moment that it becomes eligible. The runtime will actually purge the instances when it sees it fit.+++ | | {nbsp}
+|======================
+
+
+==== Associated Operations
+* <<Crawl-website>> {nbsp}
+* <<Get-page-content>> {nbsp}
+* <<Get-page-insights>> {nbsp}
+
+
+
+== Operations
+
+[[Crawl-website]]
+=== Crawl Website
+`<mac-web-crawler:crawl-website>`
+
++++
+Crawl a website at a specified depth and fetch contents. Specify tags and classes in the configuration to fetch contents from those elements only.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Configuration | String | The name of the configuration to use. | | *x*{nbsp}
+| Website URL a| String | | | *x*{nbsp}
+| Maximum Depth a| Number | | | *x*{nbsp}
+| Retrieve Meta Tags a| Boolean | | +++false+++ | {nbsp}
+| Download Images a| Boolean | | +++false+++ | {nbsp}
+| Download Location a| String | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
+
+==== For Configurations.
+* <<config>> {nbsp}
+
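+==== Example Usage
+
+An illustrative flow snippet; the element and attribute names below are inferred from the extension's XML prefix and parameter names, not taken from a verified application:
+
+[source,xml]
+----
+<mac-web-crawler:crawl-website config-ref="MAC_WebCrawler_Config"
+    url="https://mac-project.ai/docs"
+    maxDepth="2"
+    getMetaTags="true"
+    downloadImages="false"
+    downloadPath="/users/mulesoft/downloads"/>
+----
+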
+
+
+[[Get-page-content]]
+=== Get Page Content
+`<mac-web-crawler:get-page-content>`
+
++++
+Get contents of a web page. Content is returned in the resulting payload.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Configuration | String | The name of the configuration to use. | | *x*{nbsp}
+| Page Url a| String | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
+
+==== For Configurations.
+* <<config>> {nbsp}
+
+
+
+[[Get-page-insights]]
+=== Get Page Insights
+`<mac-web-crawler:get-page-insights>`
+
++++
+Get insights from a web page including links, word count, number of occurrences of elements. Restrict insights to specific elements in the configuration.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Configuration | String | The name of the configuration to use. | | *x*{nbsp}
+| Page Url a| String | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
+
+==== For Configurations.
+* <<config>> {nbsp}
+
+
+
+[[Download-image]]
+=== Download Image
+`<mac-web-crawler:download-image>`
+
++++
+Download all images from a web page, or download a single image at the specified link.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Page Or Image URL a| String | | | *x*{nbsp}
+| Download Location a| String | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
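+
+The payload is a JSON object mapping each image URL to the saved file name, for example (illustrative values):
+
+[source,json]
+----
+{ "https://example.com/img/logo.png?w=1024": "logo.png" }
+----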
+
+
+
+
+[[Generate-sitemap]]
+=== Generate Sitemap
+`<mac-web-crawler:generate-sitemap>`
+
++++
+Retrieve internal links as a site map from the specified url and depth.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Website URL a| String | | | *x*{nbsp}
+| Maximum Depth a| Number | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
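+
+The payload is the JSON serialization of the internal `SiteMapNode` tree, roughly of the following shape (illustrative values):
+
+[source,json]
+----
+{
+  "url": "https://mac-project.ai/docs",
+  "children": [
+    { "url": "https://mac-project.ai/docs/page-1", "children": [] }
+  ]
+}
+----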
+
+
+
+
+[[Get-page-meta-tags]]
+=== Get Page Meta Tags
+`<mac-web-crawler:get-page-meta-tags>`
+
++++
+Fetch the meta tags from a web page.
++++
+
+==== Parameters
+[cols=".^20%,.^20%,.^35%,.^20%,^.^5%", options="header"]
+|======================
+| Name | Type | Description | Default Value | Required
+| Page URL a| String | | | *x*{nbsp}
+| Output Mime Type a| String | +++The mime type of the payload that this operation outputs.+++ | | {nbsp}
+| Target Variable a| String | +++The name of a variable on which the operation's output will be placed+++ | | {nbsp}
+| Target Value a| String | +++An expression that will be evaluated against the operation's output and the outcome of that expression will be stored in the target variable+++ | +++#[payload]+++ | {nbsp}
+|======================
+
+==== Output
+[cols=".^50%,.^50%"]
+|======================
+| *Type* a| String
+|======================
+
+
+
+
+
+== Types
+[[ExpirationPolicy]]
+=== Expiration Policy
+
+[cols=".^20%,.^25%,.^30%,.^15%,.^10%", options="header"]
+|======================
+| Field | Type | Description | Default Value | Required
+| Max Idle Time a| Number | A scalar time value for the maximum amount of time a dynamic configuration instance should be allowed to be idle before it's considered eligible for expiration | |
+| Time Unit a| Enumeration, one of:
+
+** NANOSECONDS
+** MICROSECONDS
+** MILLISECONDS
+** SECONDS
+** MINUTES
+** HOURS
+** DAYS | A time unit that qualifies the maxIdleTime attribute | |
+|======================
+
diff --git a/target/docs/mulechain-crawler-documentation.html b/target/docs/mac-webcrawler-documentation.html
similarity index 70%
rename from target/docs/mulechain-crawler-documentation.html
rename to target/docs/mac-webcrawler-documentation.html
index c318eff..6baaea5 100644
--- a/target/docs/mulechain-crawler-documentation.html
+++ b/target/docs/mac-webcrawler-documentation.html
@@ -5,7 +5,7 @@
-<title>Mulechain Crawler Module Documentation Reference</title>
+<title>MAC WebCrawler Module Documentation Reference</title>