[Commoncrawl pipeline] Add component extract free-to-use images (#282)

This third component of the pipeline extracts image URLs, alt text and license metadata from each webpage's URL and HTML code.
ml6team · Jul 20, 2023 · b8bfaef · b8bfaef
1 parent be7bd0b
commit b8bfaef
Show file tree

Hide file tree

Showing 7 changed files with 303 additions and 0 deletions.
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile b/examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile
@@ -0,0 +1,18 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies: git is installed because requirements.txt pulls fondant
+# from a git+https URL, which pip can only fetch when git is available.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install the Python requirements before copying the sources, so this layer
+# does not depend on the contents of src/.
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy the component sources (main.py and the utils package) into the working directory
+COPY src/ .
+
+# Run the component; main.py is resolved relative to WORKDIR
+ENTRYPOINT ["python", "main.py"]
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/README.md b/examples/pipelines/commoncrawl/components/extract_image_licenses/README.md
@@ -0,0 +1,8 @@
+# extract_image_licenses
+
+### Description
+This component extracts image URLs and license metadata from a dataframe of webpage URLs and HTML code.
+
+### **Inputs/Outputs**
+
+See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters.
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml b/examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml
@@ -0,0 +1,25 @@
+name: Extract image url and license from commoncrawl
+description: Component that extracts image url and license metadata from a dataframe of webpage urls and html codes
+image: ghcr.io/ml6team/extract_image_licenses:latest
+
+consumes:
+  webpage:
+    fields:
+      url:
+        type: string
+      html:
+        type: string
+
+produces:
+  image:
+    fields:
+      image_url:
+        type: string
+      alt_text:
+        type: string
+      webpage_url:
+        type: string
+      license_type:
+        type: string
+      license_location:
+        type: string
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt b/examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.12.2
+git+https://github.com/ml6team/fondant@main
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py
@@ -0,0 +1,88 @@
+import re
+import logging
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from typing import List
+
+from fondant.component import PandasTransformComponent
+
+from utils.license_utils import get_license_type, get_license_location
+from utils.image_utils import get_images_from_soup, get_unique_images
+
+logger = logging.getLogger(__name__)
+
+
+def get_image_info_from_webpage(webpage_url: str, webpage_html: str) -> List[List[str]]:
+    """Collects the licensed images found on a single webpage.
+
+    The anchors of the page are scanned in document order. The first anchor for
+    which both a license type and a license location can be determined decides
+    the license metadata attached to every image extracted from the page.
+
+    Args:
+        webpage_url: The url of the webpage.
+        webpage_html: The html content of the webpage.
+    Returns:
+        The de-duplicated image entries, or None when no anchor yields both a
+        license type and a license location, or when an exception is raised
+        while processing the page.
+    """
+    try:
+        soup = BeautifulSoup(webpage_html, "html.parser")
+
+        for anchor in soup.find_all("a"):
+            # Anchors without a target cannot point to a license.
+            if not anchor.has_attr("href"):
+                continue
+
+            license_type = get_license_type(anchor)
+            if license_type is None:
+                continue
+
+            license_location = get_license_location(anchor)
+            if license_location is None:
+                continue
+
+            logger.info(f"Found license type: {license_type} at {license_location}")
+
+            images = get_images_from_soup(
+                soup, webpage_url, license_type, license_location
+            )
+            logger.info(f"Found {len(images)} images.")
+
+            unique_images = get_unique_images(images)
+            logger.info(f"Found {len(unique_images)} unique images.")
+
+            # The first usable license link settles the result for the page.
+            return unique_images
+
+    except Exception as e:
+        # Best effort: a page that cannot be processed is logged and skipped.
+        logger.error(f"Error parsing HTML: {e}")
+        return None
+
+
+class ExtractImageLicenses(PandasTransformComponent):
+    """Component that turns a dataframe of webpages into one row per licensed image."""
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extracts image url and license from the HTML content.
+
+        Args:
+            df: A pandas dataframe with the webpage url and html content, read
+                from the `("webpage", "url")` and `("webpage", "html")` columns.
+        Returns:
+            A pandas dataframe with one row per extracted image, holding the
+            image url, alt text, webpage url, license type and license location.
+            Each row keeps the index label of the webpage it was extracted from.
+            Webpages for which nothing was extracted contribute no rows.
+        """
+        output_columns = [
+            ("image", "image_url"),
+            ("image", "alt_text"),
+            ("image", "webpage_url"),
+            ("image", "license_type"),
+            ("image", "license_location"),
+        ]
+
+        # Rows are built explicitly rather than through
+        # `DataFrame.apply(..., result_type="expand")`: the shape of that result
+        # depends on how many images each page yields, so it cannot be relied on
+        # to produce exactly the five output columns.
+        row_ids = []
+        records = []
+        pages = zip(df.index, df[("webpage", "url")], df[("webpage", "html")])
+        for row_id, webpage_url, webpage_html in pages:
+            # None (no license found or parsing failed) contributes no rows.
+            images = get_image_info_from_webpage(webpage_url, webpage_html) or []
+            for image in images:
+                row_ids.append(row_id)
+                records.append(image)
+
+        # Positional column labels keep the frame well-formed even when no
+        # image was found at all; the final labels are assigned afterwards.
+        result = pd.DataFrame(
+            records, index=row_ids, columns=range(len(output_columns))
+        )
+        result = result.dropna()
+        result.columns = output_columns
+
+        return result
+
+
+# Entry point when the module is executed as a script: build the component and run it.
+if __name__ == "__main__":
+    component = ExtractImageLicenses.from_args()
+    component.run()
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py
@@ -0,0 +1,83 @@
+import logging
+from urllib.parse import urlparse
+from typing import Any, List
+
+logger = logging.getLogger(__name__)
+
+
+def get_full_image_url(image_url: str, webpage_url: str) -> str:
+    """Returns the full image url if not already provided.
+
+    Urls that already carry a scheme are returned untouched. Everything else
+    (root-relative paths such as "/img/a.png", page-relative paths such as
+    "img/a.png" and protocol-relative urls such as "//cdn.example.com/a.png")
+    is resolved against the webpage url, after which the query string is
+    removed.
+
+    Args:
+        image_url: The image url, as found in the `src` attribute.
+        webpage_url: The url of the webpage.
+    Returns:
+        The full image url.
+    """
+    # Imported locally so that this function does not depend on a change to
+    # the module-level imports.
+    from urllib.parse import urljoin, urlsplit
+
+    if urlsplit(image_url).scheme:
+        # Already absolute: keep it exactly as written, query string included.
+        return image_url
+
+    absolute_url = urljoin(webpage_url, image_url)
+
+    # Resolved urls are returned without their query string.
+    return absolute_url.split("?", 1)[0]
+
+
+def get_image_info(
+    a_tag: Any, webpage_url: str, license_type: str, license_location: str
+) -> List[str]:
+    """Builds the output entry for the image nested inside an anchor.
+    Args:
+        a_tag: The anchor element to search for an `img` element.
+        webpage_url: The url of the webpage.
+        license_type: The license type.
+        license_location: The license location.
+    Returns:
+        A list of image url, alt text, webpage url, license type and license
+        location, or None when the anchor holds no `img` with a `src` attribute.
+    """
+    image = a_tag.find("img")
+
+    if not image or not image.has_attr("src"):
+        return None
+
+    return [
+        get_full_image_url(image["src"], webpage_url),
+        image.get("alt", ""),  # a missing alt text is stored as an empty string
+        webpage_url,
+        license_type,
+        license_location,
+    ]
+
+
+def get_images_from_soup(
+    soup: Any, webpage_url: str, license_type: str, license_location: str
+) -> List[List[str]]:
+    """Gathers the image entries of every anchor in the parsed html code.
+    Args:
+        soup: The parsed html code.
+        webpage_url: The url of the webpage.
+        license_type: The license type.
+        license_location: The license location.
+    Returns:
+        One entry per anchor for which `get_image_info` produced a result, in
+        document order."""
+    candidates = (
+        get_image_info(anchor, webpage_url, license_type, license_location)
+        for anchor in soup.find_all("a")
+    )
+    # Anchors without a usable image produce no entry and are left out.
+    image_info = [entry for entry in candidates if entry]
+
+    logger.info(f"Found {len(image_info)} images.")
+    return image_info
+
+
+def get_unique_images(images: List[List[str]]) -> List[List[str]]:
+    """Returns a list of unique images.
+
+    The first occurrence of every image is kept and the original order is
+    preserved.
+
+    Args:
+        images: A list of images, each a list of strings.
+    Returns:
+        A list of unique images.
+    """
+    # Lists are not hashable, so each image is tracked as a tuple; this keeps
+    # the membership test constant-time instead of rescanning the output list.
+    seen = set()
+    unique_images = []
+    for image in images:
+        key = tuple(image)
+        if key not in seen:
+            seen.add(key)
+            unique_images.append(image)
+    return unique_images
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py
@@ -0,0 +1,79 @@
+import re
+import logging
+from typing import Any
+
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+
+def get_license_location(element: Any) -> str:
+    """Determines in which section of the page an element is located.
+
+    The ancestors of the element are visited from the closest one upwards. An
+    ancestor matches a location ("footer", "aside" or "sidebar", checked in
+    that order) when its tag name equals the location, or when it contains a
+    `div` whose id or class equals the location.
+
+    Args:
+        element: The parsed html element to locate.
+    Returns:
+        The license location of the first matching ancestor, or None when no
+        ancestor matches.
+    """
+    ancestor = element.parent
+
+    while ancestor is not None:
+        for location in ("footer", "aside", "sidebar"):
+            if (
+                ancestor.name == location
+                or ancestor.find("div", id=location)
+                or ancestor.find("div", class_=location)
+            ):
+                return location
+        ancestor = ancestor.parent
+
+    return None  # could not find an appropriate tag
+
+
+def get_license_type_from_creative_commons_url(license_url: str) -> str:
+    """Returns the license type from the creative commons url.
+
+    Args:
+        license_url: The creative commons url.
+    Returns:
+        "public domain" when the url path has a "publicdomain" segment,
+        otherwise the first path segment containing "by" (for example "by-sa"
+        for ".../licenses/by-sa/4.0/"). None when the path has neither, so the
+        caller can move on to the next link instead of failing.
+    """
+    path_segments = urlparse(license_url).path.split("/")
+    logger.info(f"license_split: {path_segments}")
+
+    if "publicdomain" in path_segments:
+        return "public domain"
+
+    # Links to creativecommons.org that are not license deeds (home page,
+    # "about" pages, ...) have no such segment; report that as "no license".
+    return next((segment for segment in path_segments if "by" in segment), None)
+
+
+def get_license_type_from_fandom_url(a_tag: Any) -> str:
+    """Returns the license type given by a fandom licensing link.
+    Args:
+        a_tag: The anchor element linking to the fandom licensing page.
+    Returns:
+        The text of the anchor, unmodified.
+    """
+    link_text = a_tag.text
+    return link_text
+
+
+def get_license_type(a_tag: Any) -> str:
+    """Derives the license type from the target of an anchor.
+    Args:
+        a_tag: The anchor element; its `href` attribute is inspected.
+    Returns:
+        The license type for fandom licensing and creative commons links,
+        None for any other link.
+    """
+    link_target = a_tag.get("href")
+
+    if "fandom.com/licensing" in link_target:
+        return get_license_type_from_fandom_url(a_tag)
+
+    if "creativecommons.org" in link_target:
+        return get_license_type_from_creative_commons_url(link_target)
+
+    return None
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		beautifulsoup4==4.12.2
		git+https://github.com/ml6team/fondant@main