Improve commoncrawl components (#403)
Improved commoncrawl download components for the license-free image use case.
RobbeSneyders authored Sep 13, 2023
1 parent 89e8cde commit eec81a5
Showing 13 changed files with 41 additions and 28 deletions.
@@ -1,6 +1,6 @@
-name: download warc component
-description: A component that downloads parts of the common crawl
-image: ghcr.io/ml6team/common_crawl_download_warc:cadb918
+name: Extract image licenses from warc
+description: A component that extracts images and their licenses from warc files
+image: ghcr.io/ml6team/extract_images_from_warc:d4619b5
 
 consumes:
   warc:
@@ -1,3 +1,4 @@
 trafilatura==1.6.1
 beautifulsoup4==4.12.2
-fastwarc
+fastwarc==0.14.5
+distributed==2023.8.1
@@ -2,18 +2,21 @@
 import logging
 import typing as t
 
+import dask
 import dask.dataframe as dd
 import pandas as pd
 from bs4 import BeautifulSoup
 from fondant.component import DaskTransformComponent
-from fastwarc.warc import ArchiveIterator, WarcRecordType
+from fastwarc import ArchiveIterator, StreamError, WarcRecordType
 
 from utils.download_utils import download_warc_file
 from utils.license_utils import get_license_type, get_license_location
 from utils.image_utils import get_images_from_soup, get_unique_images
 
 logger = logging.getLogger(__name__)
 
+dask.config.set(scheduler="processes")
+
 CC_BASE_URL = "http://data.commoncrawl.org"
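Note: dask.config.set(scheduler="processes") switches Dask from its default threaded scheduler to a multiprocessing one, so the CPU-bound WARC parsing in this component is not serialized on the GIL. A minimal sketch of what the setting changes (illustrative, not from this commit):

    import dask
    import dask.dataframe as dd
    import pandas as pd

    dask.config.set(scheduler="processes")  # run tasks in worker processes instead of threads

    ddf = dd.from_pandas(pd.DataFrame({"x": range(8)}), npartitions=4)
    # each partition is now computed in its own process, bypassing the GIL
    print(ddf["x"].map_partitions(lambda s: s * 2).compute())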


@@ -72,17 +75,20 @@ def filter_(record):
                 return False
             return True
 
-        for record in ArchiveIterator(
-            file,
-            record_types=WarcRecordType.response,
-            func_filter=filter_,
-        ):
-            url = record.headers.get("WARC-Target-URI")
-            content = record.reader.read().decode("utf-8", "replace")
-            if content:
-                image_info = self.get_image_info_from_webpage(url, content)
-                if image_info:
-                    images.extend(image_info)
+        try:
+            for record in ArchiveIterator(
+                file,
+                record_types=WarcRecordType.response,
+                func_filter=filter_,
+            ):
+                url = record.headers.get("WARC-Target-URI")
+                content = record.reader.read().decode("utf-8", "replace")
+                if content:
+                    image_info = self.get_image_info_from_webpage(url, content)
+                    if image_info:
+                        images.extend(image_info)
+        except StreamError as e:
+            logging.warning(e)
 
         return images
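The iteration is wrapped in try/except because fastwarc raises StreamError mid-iteration on truncated or corrupt archives, which would otherwise abort the whole partition. A standalone sketch of the same pattern, assuming a hypothetical local file example.warc.gz:

    from fastwarc import ArchiveIterator, StreamError, WarcRecordType

    def iter_html_responses(path):
        """Yield (url, body) pairs for HTTP response records in a WARC file."""
        with open(path, "rb") as f:
            try:
                for record in ArchiveIterator(f, record_types=WarcRecordType.response):
                    url = record.headers.get("WARC-Target-URI")
                    body = record.reader.read().decode("utf-8", "replace")
                    yield url, body
            except StreamError as exc:  # truncated archive: keep what was read so far
                print(f"stopped early: {exc}")

    for url, body in iter_html_responses("example.warc.gz"):
        print(url, len(body))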

@@ -97,8 +103,12 @@ def download_and_extract_warc(
         """
         logger.warning(f"Processing WARC file: {warc_file}...")
 
-        response = download_warc_file(warc_file)
-        return self.extract_images(response.raw)
+        try:
+            response = download_warc_file(warc_file)
+            return self.extract_images(response.raw)
+        except BaseException as e:
+            logging.warning(e)
+            return []
 
     def download_and_extract_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         """Download and extract all warc files in a dataframe."""
@@ -12,7 +12,7 @@


 def download_warc_file(
-    warc_file: str, retries: int = 3, backoff_factor: int = 5
+    warc_file: str, retries: int = 10, backoff_factor: int = 5
 ) -> requests.Response:
     """Downloads a WARC file using http requests.
     Args:
@@ -33,6 +33,6 @@ def download_warc_file(
         response = session.get(COMMONCRAWL_BASE_URL + warc_file, stream=True)
         response.raise_for_status()
         return response
-    except requests.exceptions.RequestException as e:
+    except Exception as e:
         logger.error(f"Error downloading WARC file: {e}")
         raise
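Widening the except clause from requests.exceptions.RequestException to Exception means any failure while streaming is logged before re-raising. The session above presumably carries the retry policy; a hedged sketch of the usual way retries and backoff_factor are wired into a requests session (the actual setup is outside the lines shown):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def make_session(retries: int = 10, backoff_factor: int = 5) -> requests.Session:
        session = requests.Session()
        retry = Retry(
            total=retries,
            backoff_factor=backoff_factor,  # exponential backoff between attempts
            status_forcelist=[500, 502, 503, 504],
        )
        session.mount("http://", HTTPAdapter(max_retries=retry))
        session.mount("https://", HTTPAdapter(max_retries=retry))
        return session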
@@ -1,6 +1,6 @@
 name: Common crawl download component
 description: A component that downloads parts of the common crawl
-image: ghcr.io/ml6team/read_warc_paths:dev
+image: ghcr.io/ml6team/read_warc_paths:57404ff
 
 produces:
   warc:
@@ -41,4 +41,4 @@ def load(self) -> dd.DataFrame:
         if self.n_records_to_download is not None:
             df = df.head(self.n_records_to_download)
 
-        return dd.from_pandas(df, npartitions=len(df))
+        return dd.from_pandas(df, npartitions=len(df) // 100)
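Batching roughly 100 WARC paths per partition, instead of one per row, cuts Dask task and scheduling overhead at the cost of coarser parallelism. An illustrative sketch (note that len(df) // 100 is 0 for fewer than 100 rows, so this assumes reasonably large inputs):

    import dask.dataframe as dd
    import pandas as pd

    # made-up path strings, for illustration only
    df = pd.DataFrame({"warc": [f"segments/{i}.warc.gz" for i in range(500)]})
    ddf = dd.from_pandas(df, npartitions=len(df) // 100)
    print(ddf.npartitions)  # 5 partitions of ~100 paths each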
11 changes: 7 additions & 4 deletions examples/pipelines/commoncrawl/pipeline.py
@@ -14,14 +14,17 @@
 
 read_warc_paths_op = ComponentOp(
     component_dir="components/read_warc_paths",
-    arguments={"common_crawl_indices": ["CC-MAIN-2023-23"], "n_records_to_download": 1},
+    arguments={"common_crawl_indices": ["CC-MAIN-2023-06"]},
+    cache=False,
 )
 
-load_warc_files_op = ComponentOp(
-    component_dir="components/download_warc_files",
+extract_images_op = ComponentOp(
+    component_dir="components/extract_images_from_warc",
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-128-pool-3",
 )
 
 pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH)
 
 pipeline.add_op(read_warc_paths_op)
-pipeline.add_op(load_warc_files_op, dependencies=[read_warc_paths_op])
+pipeline.add_op(extract_images_op, dependencies=[read_warc_paths_op])
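For context, the general ComponentOp/Pipeline wiring pattern used in this file, as a minimal sketch with placeholder names and paths (not the actual pipeline configuration):

    from fondant.pipeline import ComponentOp, Pipeline

    pipeline = Pipeline(pipeline_name="my-pipeline", base_path="gs://my-bucket/artifacts")

    first_op = ComponentOp(
        component_dir="components/read_warc_paths",
        cache=False,  # always re-run instead of reusing cached results
    )
    second_op = ComponentOp(component_dir="components/extract_images_from_warc")

    pipeline.add_op(first_op)
    pipeline.add_op(second_op, dependencies=[first_op])  # runs on first_op's output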
2 changes: 1 addition & 1 deletion src/fondant/cli.py
@@ -344,7 +344,7 @@ def run(args):
     else:
         spec_ref = args.output_path
     logging.info(
-        "Found reference to un-compiled pipeline... compiling to {spec_ref}",
+        f"Found reference to un-compiled pipeline... compiling to {spec_ref}",
     )
     compiler = KubeFlowCompiler()
     compiler.compile(pipeline=pipeline, output_path=spec_ref)
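The one-character fix matters: without the f prefix, Python logs the literal text {spec_ref} rather than interpolating the variable. For example:

    spec_ref = "pipeline.yaml"
    print("compiling to {spec_ref}")   # -> compiling to {spec_ref}  (the old bug)
    print(f"compiling to {spec_ref}")  # -> compiling to pipeline.yaml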
1 change: 0 additions & 1 deletion src/fondant/data_io.py
@@ -261,7 +261,6 @@ def _create_write_task(
             schema=schema,
             overwrite=False,
             compute=False,
-            write_metadata_file=True,
         )
         logging.info(f"Creating write task for: {location}")
         return write_task
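Dropping write_metadata_file=True falls back to Dask's default for to_parquet, which in recent versions skips the global _metadata file; building that file aggregates footers from every partition and can bottleneck or fail on large writes. A hedged sketch, assuming the surrounding call is dask.dataframe.to_parquet with compute=False returning a deferred write task:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
    write_task = ddf.to_parquet("out.parquet", overwrite=False, compute=False)
    write_task.compute()  # executes the deferred write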
