diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile b/examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile
new file mode 100644
index 000000000..605adc7e9
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile
@@ -0,0 +1,18 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/README.md b/examples/pipelines/commoncrawl/components/extract_image_licenses/README.md
new file mode 100644
index 000000000..fb6c51f07
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/README.md
@@ -0,0 +1,8 @@
+# extract_image_licenses
+
+### Description
+This component extracts image URLs and license metadata from a dataframe of webpage URLs and HTML code.
+
+### **Inputs/Outputs**
+
+See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters.
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml b/examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml
new file mode 100644
index 000000000..e172e5b7b
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml
@@ -0,0 +1,25 @@
+name: Extract image url and license from commoncrawl
+description: Component that extracts image URLs and license metadata from a dataframe of webpage URLs and HTML code
+image: ghcr.io/ml6team/extract_image_licenses:latest
+
+consumes:
+  webpage:
+    fields:
+      url:
+        type: string
+      html:
+        type: string
+
+produces:
+  image:
+    fields:
+      image_url:
+        type: string
+      alt_text:
+        type: string
+      webpage_url:
+        type: string
+      license_type:
+        type: string
+      license_location:
+        type: string
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt b/examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt
new file mode 100644
index 000000000..ceaf0c748
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.12.2
+git+https://github.com/ml6team/fondant@main
\ No newline at end of file
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py
new file mode 100644
index 000000000..1451c779d
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py
@@ -0,0 +1,88 @@
+import re
+import logging
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from typing import List, Optional
+
+from fondant.component import PandasTransformComponent
+
+from utils.license_utils import get_license_type, get_license_location
+from utils.image_utils import get_images_from_soup, get_unique_images
+
+logger = logging.getLogger(__name__)
+
+
+def get_image_info_from_webpage(webpage_url: str, webpage_html: str) -> Optional[List[List[str]]]:
+    """Extracts image URLs and license metadata from the HTML code of a webpage.
+    Args:
+        webpage_url: The URL of the webpage.
+        webpage_html: The HTML content of the webpage.
+    Returns:
+        A list of image URL and license metadata records, or None if no license was found.
+    """
+
+    try:
+        soup = BeautifulSoup(webpage_html, "html.parser")
+        for a_tag in soup.find_all("a"):
+            if a_tag.has_attr("href"):
+                license_type = get_license_type(a_tag)
+                if license_type is not None:
+                    license_location = get_license_location(a_tag)
+
+                    if license_location is None:
+                        continue
+                    logger.info(
+                        f"Found license type: {license_type} at {license_location}"
+                    )
+                    images = get_images_from_soup(
+                        soup, webpage_url, license_type, license_location
+                    )
+                    logger.info(f"Found {len(images)} images.")
+
+                    unique_images = get_unique_images(images)
+                    logger.info(f"Found {len(unique_images)} unique images.")
+
+                    return unique_images
+
+    except Exception as e:
+        logger.error(f"Error parsing HTML: {e}")
+        return None
+
+
+class ExtractImageLicenses(PandasTransformComponent):
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extracts image URLs and license metadata from the HTML content of each webpage.
+        Args:
+            df: A pandas dataframe with the webpage URL and HTML content.
+        Returns:
+            A pandas dataframe with the image URL and license metadata.
+        """
+        df = (
+            df.apply(
+                lambda row: get_image_info_from_webpage(
+                    row[("webpage", "url")], row[("webpage", "html")]
+                ),
+                axis=1,
+            )
+            .dropna()
+            .explode()
+            .apply(pd.Series)
+        )
+
+        df = df.dropna()
+
+        df.columns = [
+            ("image", "image_url"),
+            ("image", "alt_text"),
+            ("image", "webpage_url"),
+            ("image", "license_type"),
+            ("image", "license_location"),
+        ]
+
+        return df
+
+
+if __name__ == "__main__":
+    component = ExtractImageLicenses.from_args()
+    component.run()
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py
new file mode 100644
index 000000000..21599ebfd
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py
@@ -0,0 +1,83 @@
+import logging
+from urllib.parse import urlparse
+from typing import Any, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def get_full_image_url(image_url: str, webpage_url: str) -> str:
+    """Returns the full image URL, resolving relative URLs against the webpage URL.
+    Args:
+        image_url: The image URL as found in the src attribute.
+        webpage_url: The URL of the webpage.
+    Returns:
+        The full image URL, without query parameters.
+    """
+    if "http" not in image_url or image_url[0] == "/":
+        parsed_webpage_url = urlparse(webpage_url)
+        image_url = (
+            f"{parsed_webpage_url.scheme}://{parsed_webpage_url.netloc}{image_url}"
+        )
+
+    try:
+        pos = image_url.index("?")
+        image_url = image_url[:pos]
+    except ValueError:
+        pass
+
+    return image_url
+
+
+def get_image_info(
+    a_tag: Any, webpage_url: str, license_type: str, license_location: str
+) -> Optional[List[str]]:
+    """Returns the image info for the image linked from an anchor tag, if any.
+    Args:
+        a_tag: The parsed anchor tag.
+        webpage_url: The URL of the webpage.
+        license_type: The license type.
+    Returns:
+        A list of [image URL, alt text, webpage URL, license type, license location], or None.
+    """
+    img_tag = a_tag.find("img")
+
+    if img_tag and img_tag.has_attr("src"):
+        img_src = get_full_image_url(img_tag["src"], webpage_url)
+        img_alt = img_tag.get("alt", "")
+        return [img_src, img_alt, webpage_url, license_type, license_location]
+
+    return None
+
+
+def get_images_from_soup(
+    soup: Any, webpage_url: str, license_type: str, license_location: str
+) -> List[List[str]]:
+    """Returns a list of image info records extracted from the parsed HTML code.
+    Args:
+        soup: The parsed HTML code.
+        webpage_url: The URL of the webpage.
+        license_type: The license type.
+    Returns:
+        A list of image info records."""
+    image_info = []
+    for a_tag in soup.find_all("a"):
+        img_info = get_image_info(a_tag, webpage_url, license_type, license_location)
+        if img_info:
+            image_info.append(img_info)
+
+    logger.info(f"Found {len(image_info)} images.")
+    return image_info
+
+
+def get_unique_images(images: List[List[str]]) -> List[List[str]]:
+    """Returns a list of unique images.
+    Args:
+        images: A list of image info records.
+    Returns:
+        A list of unique image info records.
+    """
+    unique_images = []
+    for image in images:
+        if image not in unique_images:
+            unique_images.append(image)
+    return unique_images
diff --git a/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py
new file mode 100644
index 000000000..b11f7e12a
--- /dev/null
+++ b/examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py
@@ -0,0 +1,79 @@
+import re
+import logging
+from typing import Any, Optional
+
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+
+def get_license_location(element: Any) -> Optional[str]:
+    """Returns the license location from the parsed HTML code.
+    Args:
+        element: The parsed HTML element containing the license link.
+    Returns:
+        The license location, or None if no appropriate parent tag was found.
+    """
+    parent = element.parent
+
+    if parent is None:  # could not find an appropriate tag
+        return None
+
+    if (
+        parent.name == "footer"
+        or parent.find("div", id="footer")
+        or parent.find("div", class_="footer")
+    ):
+        return "footer"
+    elif (
+        parent.name == "aside"
+        or parent.find("div", id="aside")
+        or parent.find("div", class_="aside")
+    ):
+        return "aside"
+    elif (
+        parent.name == "sidebar"
+        or parent.find("div", id="sidebar")
+        or parent.find("div", class_="sidebar")
+    ):
+        return "sidebar"
+    else:
+        return get_license_location(parent)
+
+
+def get_license_type_from_creative_commons_url(license_url: str) -> Optional[str]:
+    """Returns the license type from the Creative Commons URL.
+    Args:
+        license_url: The Creative Commons URL.
+    Returns:
+        The license type, or None if it could not be determined.
+    """
+    license_split = urlparse(license_url).path.split("/")
+    logger.info(f"license_split: {license_split}")
+
+    if "publicdomain" in license_split:
+        return "public domain"
+    else:
+        license_parts = [part for part in license_split if "by" in part]
+        return license_parts[0] if license_parts else None
+
+
+def get_license_type_from_fandom_url(a_tag: Any) -> str:
+    return a_tag.text
+
+
+def get_license_type(a_tag: Any) -> Optional[str]:
+    """Returns the license type from the parsed HTML code.
+    Args:
+        a_tag: The parsed anchor tag containing the license link.
+    Returns:
+        The license type, or None if no license was recognized.
+    """
+    href = a_tag.get("href")
+
+    if "fandom.com/licensing" in href:
+        return get_license_type_from_fandom_url(a_tag)
+    elif "creativecommons.org" in href:
+        return get_license_type_from_creative_commons_url(href)
+    else:
+        return None
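
For reviewers who want to exercise the extraction logic outside of a Fondant pipeline, here is a minimal, hypothetical smoke test of the helpers added in this diff. It is not part of the component files; it assumes it is run from the component's `src/` directory (so `main` and `utils` are importable) and that `beautifulsoup4` is installed. The webpage URL and HTML snippet are invented for illustration.

```python
# Hypothetical smoke test (not part of the diff). Run from the component's src/
# directory so that `main` and `utils` resolve as in the component image.
from main import get_image_info_from_webpage

# Made-up webpage with a Creative Commons license link in the footer.
webpage_url = "https://example.com/gallery"
webpage_html = """
<html>
  <body>
    <img src="/images/unlicensed.png" alt="not wrapped in a license link"/>
    <footer>
      <a href="https://creativecommons.org/licenses/by-sa/4.0/">
        <img src="/images/badge.png?size=small" alt="CC BY-SA badge"/>
      </a>
    </footer>
  </body>
</html>
"""

# Each record is [image_url, alt_text, webpage_url, license_type, license_location];
# for this snippet the single linked image resolves to
# ["https://example.com/images/badge.png", "CC BY-SA badge",
#  "https://example.com/gallery", "by-sa", "footer"].
print(get_image_info_from_webpage(webpage_url, webpage_html))
```

The unlinked image is ignored because only images wrapped in an `<a>` tag with a recognized license href are collected; relative `src` values are resolved against the webpage URL and query parameters are stripped by `get_full_image_url`.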