Skip to content

Commit

Permalink
[Commoncrawl pipeline] Add component extract free-to-use images (#282)
Browse files Browse the repository at this point in the history
This 3rd component extracts the image url, alt text and license metadata
from the webpage url and html code.
  • Loading branch information
shayorshay authored Jul 20, 2023
1 parent be7bd0b commit b8bfaef
Show file tree
Hide file tree
Showing 7 changed files with 303 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

# System dependencies. git is installed so pip can fetch requirements that
# are declared as git+https URLs. The apt package index is removed in the
# same layer so it does not end up in the final image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# Install the Python requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# extract_image_licenses

### Description
This component extracts image URLs and license metadata from a dataframe of webpage URLs and HTML code.

### **Inputs/Outputs**

See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: Extract image url and license from commoncrawl
description: Component that extracts image url and license metadata from a dataframe of webpage urls and html codes
image: ghcr.io/ml6team/extract_image_licenses:latest

consumes:
webpage:
fields:
url:
type: string
html:
type: string

produces:
image:
fields:
image_url:
type: string
alt_text:
type: string
webpage_url:
type: string
license_type:
type: string
license_location:
type: string
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.2
git+https://github.com/ml6team/fondant@main
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re
import logging

import pandas as pd
from bs4 import BeautifulSoup
from typing import List

from fondant.component import PandasTransformComponent

from utils.license_utils import get_license_type, get_license_location
from utils.image_utils import get_images_from_soup, get_unique_images

logger = logging.getLogger(__name__)


def get_image_info_from_webpage(webpage_url: str, webpage_html: str) -> List[List[str]]:
    """Collect the licensed images found on a single webpage.

    The anchors of the page are scanned in document order. The first anchor
    that yields both a license type and a license location decides the
    license metadata attached to every image extracted from the page.
    Args:
        webpage_url: The url of the webpage.
        webpage_html: The html content of the webpage.
    Returns:
        The de-duplicated image records of the page, or None when no anchor
        yields a usable license or when an exception is raised on the way.
    """
    try:
        soup = BeautifulSoup(webpage_html, "html.parser")

        for anchor in soup.find_all("a"):
            # Anchors without a target cannot point to a license.
            if not anchor.has_attr("href"):
                continue

            license_type = get_license_type(anchor)
            if license_type is None:
                continue

            license_location = get_license_location(anchor)
            if license_location is None:
                continue

            logger.info(f"Found license type: {license_type} at {license_location}")

            images = get_images_from_soup(
                soup, webpage_url, license_type, license_location
            )
            logger.info(f"Found {len(images)} images.")

            unique_images = get_unique_images(images)
            logger.info(f"Found {len(unique_images)} unique images.")

            # The first usable license link settles the result for the page.
            return unique_images

        # No anchor produced both a license type and a license location.
        return None

    except Exception as e:
        logger.error(f"Error parsing HTML: {e}")
        return None


class ExtractImageLicenses(PandasTransformComponent):
    """Component that derives a dataframe of images from a dataframe of webpages."""

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extracts image url and license from the HTML content.
        Args:
            df: A pandas dataframe with the webpage url and html content.
        Returns:
            A pandas dataframe with the image url and license metadata.
        """
        # get_image_info_from_webpage is called once per row (axis=1) with the
        # row's ("webpage", "url") and ("webpage", "html") values.
        # NOTE(review): the explode(0) / apply(pd.Series) steps appear intended
        # to yield one row per image and one column per image field - confirm
        # how this behaves when pages return different numbers of images.
        df = (
            df.apply(
                lambda row: get_image_info_from_webpage(
                    row[("webpage", "url")], row[("webpage", "html")]
                ),
                axis=1,
                result_type="expand",
            )
            .explode(0)
            .apply(pd.Series)
        )

        # Drop every row that holds a missing value.
        # NOTE(review): presumably this removes the pages for which
        # get_image_info_from_webpage returned None - confirm.
        df = df.dropna()

        # Relabel the columns as (subset, field) tuples. The five labels follow
        # the order of the values in the list built by get_image_info; the
        # assignment requires the frame to have exactly five columns.
        df.columns = [
            ("image", "image_url"),
            ("image", "alt_text"),
            ("image", "webpage_url"),
            ("image", "license_type"),
            ("image", "license_location"),
        ]

        return df


if __name__ == "__main__":
    # Build the component through its from_args() constructor and run it.
    ExtractImageLicenses.from_args().run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import logging
from urllib.parse import urlparse
from typing import Any, List

logger = logging.getLogger(__name__)


def get_full_image_url(image_url: str, webpage_url: str) -> str:
    """Return the absolute image url with its query string removed.

    The image url is resolved against the webpage url with the standard
    relative-reference rules: absolute urls are kept as they are, while
    root-relative ("/img/a.png"), document-relative ("img/a.png") and
    protocol-relative ("//cdn.example.com/a.png") references are completed
    from the webpage url.
    Args:
        image_url: The image url, possibly relative.
        webpage_url: The url of the webpage the image was found on.
    Returns:
        The absolute image url, cut off at the first "?".
    """
    # Imported here so the function does not rely on the module's import block.
    from urllib.parse import urljoin

    absolute_url = urljoin(webpage_url, image_url)

    # Keep only what precedes the first "?"; the url is returned unchanged
    # when it has no query string.
    return absolute_url.partition("?")[0]


def get_image_info(
    a_tag: Any, webpage_url: str, license_type: str, license_location: str
) -> List[str]:
    """Build the record describing the image found inside an anchor tag.
    Args:
        a_tag: The parsed anchor element that is searched for an img tag.
        webpage_url: The url of the webpage.
        license_type: The license type.
        license_location: The license location.
    Returns:
        A list of image url, alt text, webpage url, license type and license
        location, or None when the anchor holds no img tag with a src attribute.
    """
    image = a_tag.find("img")

    # Nothing to report without an img tag that carries a source.
    if not image or not image.has_attr("src"):
        return None

    return [
        get_full_image_url(image["src"], webpage_url),
        image.get("alt", ""),  # a missing alt text becomes an empty string
        webpage_url,
        license_type,
        license_location,
    ]


def get_images_from_soup(
    soup: Any, webpage_url: str, license_type: str, license_location: str
) -> List[List[str]]:
    """Gather the image records of every anchor tag in the parsed html code.
    Args:
        soup: The parsed html code.
        webpage_url: The url of the webpage.
        license_type: The license type.
        license_location: The license location.
    Returns:
        One record per anchor for which get_image_info returns a non-empty
        result, in the order the anchors are found.
    """
    candidates = (
        get_image_info(anchor, webpage_url, license_type, license_location)
        for anchor in soup.find_all("a")
    )
    # Anchors without a usable image produce a falsy result and are left out.
    image_info = [candidate for candidate in candidates if candidate]

    logger.info(f"Found {len(image_info)} images.")
    return image_info


def get_unique_images(images: List[List[str]]) -> List[List[str]]:
    """Returns the images without duplicates, keeping the first occurrence of each.
    Args:
        images: A list of images, each one a list of strings.
    Returns:
        A list of unique images in their original order.
    """
    # Lists are not hashable, so a tuple copy of each image serves as the key
    # of the membership set; this keeps the whole pass linear in len(images).
    seen = set()
    unique_images = []
    for image in images:
        key = tuple(image)
        if key not in seen:
            seen.add(key)
            unique_images.append(image)
    return unique_images
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import re
import logging
from typing import Any

from urllib.parse import urlparse

logger = logging.getLogger(__name__)


def get_license_location(element: Any) -> str:
    """Returns the license location from the parsed html code.

    The ancestors of the element are visited from the closest one outwards.
    An ancestor matches a location when its tag name equals the location, or
    when it contains a div whose id or class equals the location. For each
    ancestor the locations are tried in the order footer, aside, sidebar.
    Args:
        element: The parsed html element to start from.
    Returns:
        "footer", "aside" or "sidebar" for the first matching ancestor, or
        None when no ancestor matches.
    """
    # The ancestors are walked in a loop rather than recursively, so deeply
    # nested html cannot exhaust the interpreter's recursion limit.
    parent = element.parent
    while parent is not None:
        for location in ("footer", "aside", "sidebar"):
            if (
                parent.name == location
                or parent.find("div", id=location)
                or parent.find("div", class_=location)
            ):
                return location
        parent = parent.parent

    # Ran past the root without finding an appropriate tag.
    return None


def get_license_type_from_creative_commons_url(license_url: str) -> str:
    """Returns the license type from the creative commons url.
    Args:
        license_url: The creative commons url.
    Returns:
        "public domain" when the url path has a "publicdomain" segment,
        otherwise the first path segment that contains "by". None when the
        path has neither, e.g. for a link to the creative commons home page.
    """
    license_split = urlparse(license_url).path.split("/")
    logger.info(f"license_split: {license_split}")

    if "publicdomain" in license_split:
        return "public domain"

    # next() with a default replaces indexing into a possibly empty list, so a
    # url without a license segment yields None instead of raising IndexError.
    return next((segment for segment in license_split if "by" in segment), None)


def get_license_type_from_fandom_url(a_tag: Any) -> str:
    """Returns the license type of a fandom licensing link.
    Args:
        a_tag: The parsed anchor element of the licensing link.
    Returns:
        The value of the anchor's text attribute.
    """
    license_type = a_tag.text
    return license_type


def get_license_type(a_tag: Any) -> str:
    """Returns the license type from the parsed html code.
    Args:
        a_tag: The parsed anchor element.
    Returns:
        The license type when the href points to a fandom licensing page or
        to creativecommons.org, None otherwise (including anchors without
        an href).
    """
    href = a_tag.get("href")

    # An anchor without an href yields None here; bail out before the
    # substring checks, which would raise TypeError on None.
    if not href:
        return None

    if "fandom.com/licensing" in href:
        return get_license_type_from_fandom_url(a_tag)
    if "creativecommons.org" in href:
        return get_license_type_from_creative_commons_url(href)
    return None

0 comments on commit b8bfaef

Please sign in to comment.