diff --git a/Cargo.toml b/Cargo.toml
index c232927..14f93fd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ itertools = "0.10.0"
 tokio = { version = "1", features = ["full"] }
 tokio-util = {version="0.6.6", features=["compat"]}
 warc = {version="0.3.0", features=["with_serde"]}
-ut1_blocklist = "0.1.0"
+ut1_blocklist = "0.1.1"
 fasttext = "0.7.0"
 bytes = "1"
 rayon = "1"
diff --git a/src/cli.rs b/src/cli.rs
index c8948f3..1a1906b 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -171,6 +171,13 @@ pub struct Pipeline {
     )]
     pub blocklist: Option<PathBuf>,
 
+    #[structopt(
+        parse(from_os_str),
+        long = "domain-blocklists",
+        help = "Domain blocklists path. For folders, each file is treated as a different blocklist. For files, the filename is used as the annotation. Use ut1-blocklist for UT1 blocklist annotations."
+    )]
+    pub domain_blocklists: Option<Vec<PathBuf>>,
+
     #[structopt(
         parse(from_os_str),
         long = "kenlms-path",
diff --git a/src/lib.rs b/src/lib.rs
index 0978ba4..e14189f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 #![doc = include_str!("../README.md")]
+pub(crate) mod cli;
 pub mod error;
 pub mod filtering;
 pub mod identifiers;
diff --git a/src/main.rs b/src/main.rs
index 09287a9..612fe05 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -60,8 +60,14 @@ async fn main() -> Result<(), error::Error> {
         cli::Ungoliant::Pipeline(p) => {
             let mut schema_filepath = p.dst.clone();
             // let p = pipeline::OscarMetadata::new(p.src, p.dst, p.lid_path);
-            let p =
-                pipelines::OscarDocNew::new(p.src, p.dst, p.lid_path, p.blocklist, p.kenlms_path);
+            let p = pipelines::OscarDocNew::new(
+                p.src,
+                p.dst,
+                p.lid_path,
+                p.blocklist,
+                p.domain_blocklists,
+                p.kenlms_path,
+            );
             p.run()?;
 
             schema_filepath.push("metadata_schema.json");
diff --git a/src/pipelines/oscardoc/pipeline.rs b/src/pipelines/oscardoc/pipeline.rs
index ab74c7b..bf16800 100644
--- a/src/pipelines/oscardoc/pipeline.rs
+++ b/src/pipelines/oscardoc/pipeline.rs
@@ -51,11 +51,14 @@ use warc::{Record, WarcHeader};
 
 use crate::io::LangFilesDoc;
 
 const DOC_THRESHOLD: f32 = 0.6f32;
+
+// TODO: Implement structopt directly here.
 pub struct OscarDoc {
     src: PathBuf,
     dst: PathBuf,
     lid_path: PathBuf,
     blocklist: Option<PathBuf>,
+    domain_blocklists: Option<Vec<PathBuf>>,
     kenlms_path: Option<PathBuf>,
 }
@@ -65,6 +68,7 @@ impl OscarDoc {
         dst: PathBuf,
         lid_path: PathBuf,
         blocklist: Option<PathBuf>,
+        domain_blocklists: Option<Vec<PathBuf>>,
         kenlms_path: Option<PathBuf>,
     ) -> Self {
         if blocklist.is_none() {
@@ -77,6 +81,7 @@ impl OscarDoc {
             dst,
             lid_path,
             blocklist,
+            domain_blocklists,
             kenlms_path,
         }
     }
@@ -128,6 +133,7 @@ impl OscarDoc {
         identifier: &FastText,
         filter: Option,
         blocklist: &Option<PathBuf>,
+        domain_blocklists: &Option<Vec<PathBuf>>,
     ) -> Result<(usize, Vec<(Document, Location)>), Error> {
         info!("working on shard: {:?}", shard_path);
 
@@ -220,10 +226,31 @@ impl OscarDoc {
                 .add(Box::new(Noisy::default()));
 
             // TODO: Same here, we instantiate it once by shard
+            // add ut1 blocklist adult annotation
             if let Some(path) = blocklist {
-                let bl = Blocklist::with_folder("adult", path)?;
+                let bl = Blocklist::with_folder("adult".to_string(), path)?;
                 annotator.add(Box::new(ContentDetector::new(bl)));
             }
+
+            // add other (custom) blocklists
+            if let Some(paths) = domain_blocklists {
+                for path in paths {
+                    if path.is_file() {
+                        let annotation = path
+                            .file_name()
+                            .map(|filename| filename.to_string_lossy().to_string());
+                        if let Some(annotation) = annotation {
+                            let bl = Blocklist::from_domains_file(annotation, path)?;
+                            info!("added content detector for annotation from {path:?}");
+                            info!("domains: {:?}", bl.domains());
+                            annotator.add(Box::new(ContentDetector::new(bl)));
+                        } else {
+                            error!("Could not get annotation for blocklist {path:?}, skipping");
+                        }
+                    }
+                }
+            }
+
             annotator
         };
 
@@ -480,7 +507,7 @@ impl Pipeline<()> for OscarDoc {
         let shards_results = results.map(|(idx, shard)| {
             (
                 idx,
-                Self::process_shard(&shard, &cls, None, &self.blocklist),
+                Self::process_shard(&shard, &cls, None, &self.blocklist, &self.domain_blocklists),
             )
         });
 
diff --git a/src/transformers/content_detector.rs b/src/transformers/content_detector.rs
index 8ea6049..2986504 100644
--- a/src/transformers/content_detector.rs
+++ b/src/transformers/content_detector.rs
@@ -14,13 +14,13 @@ use url::Url;
 
 use super::Annotate;
 
-pub struct ContentDetector<'a> {
-    bl: Blocklist<'a>,
+pub struct ContentDetector {
+    bl: Blocklist,
 }
 
-impl<'a> ContentDetector<'a> {
+impl ContentDetector {
     /// Create a new [ContentDetector] based on a specified [Blocklist].
-    pub fn new(bl: Blocklist<'a>) -> Self {
+    pub fn new(bl: Blocklist) -> Self {
         Self { bl }
     }
@@ -40,7 +40,7 @@ impl<'a> ContentDetector<'a> {
     }
 }
 
-impl<'a> Annotate for ContentDetector<'a> {
+impl Annotate for ContentDetector {
     /// Checks if domain/url is present in provided blocklist, and adds a tag
     /// corresponding to blocklist kind if true.
     fn annotate(&self, doc: &mut Document) {
@@ -50,7 +50,7 @@ impl<'a> Annotate for ContentDetector<'a> {
         // if we were successful, detect domain and url
         if let Some(valid_url) = url {
             if self.bl.detect_domain(&valid_url) || self.bl.detect_url(&valid_url) {
-                debug!("Document {} flagged as adult", doc.warc_id());
+                debug!("Document {} flagged as {}", doc.warc_id(), self.bl.kind());
                 doc.metadata_mut()
                     .add_annotation(self.bl.kind().to_string());
             }
@@ -102,7 +102,7 @@ mod tests {
         let mut domains = HashSet::new();
         domains.insert("foo.bar".to_string());
 
-        let bl = Blocklist::new("adult", domains, HashSet::new());
+        let bl = Blocklist::new("adult".to_string(), domains, HashSet::new());
         let cd = ContentDetector::new(bl);
 
         cd.annotate(&mut doc);
@@ -120,7 +120,7 @@ mod tests {
         let mut domains = HashSet::new();
         domains.insert("baz.quux".to_string());
 
-        let bl = Blocklist::new("adult", domains, HashSet::new());
+        let bl = Blocklist::new("adult".to_string(), domains, HashSet::new());
         let cd = ContentDetector::new(bl);
 
         cd.annotate(&mut doc);
diff --git a/src/transformers/lsh.rs b/src/transformers/lsh.rs
index 4ea344c..3f443a5 100644
--- a/src/transformers/lsh.rs
+++ b/src/transformers/lsh.rs
@@ -7,7 +7,7 @@ use crate::pipelines::oscardoc::types::Document;
 use warc::WarcHeader;
 
 use super::Annotate;
-use log::warn;
+use log::{debug, warn};
 
 pub struct LSH {
     builder: TlshBuilder,
 }
@@ -26,7 +26,7 @@ impl Annotate for LSH {
                 let annotation = format!("tlsh:{}", hash.hash());
                 doc.metadata_mut().add_annotation(annotation);
             }
-            Err(e) => warn!(
+            Err(e) => debug!(
                 "Could not compute a hash for document {:?}: {:?}",
                 String::from_utf8_lossy(
                     doc.warc_headers()
diff --git a/tests/oscardoc_rebuild.rs b/tests/oscardoc_rebuild.rs
index c0bd219..4c9796c 100644
--- a/tests/oscardoc_rebuild.rs
+++ b/tests/oscardoc_rebuild.rs
@@ -25,7 +25,8 @@ fn gen_corpus() {
     let bl = Path::new("res/blocklist/").to_path_buf();
     let kenlm = Path::new("res/kenlm/").to_path_buf();
 
-    let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm));
+    // TODO: test with custom blocklists
+    let pipeline = OscarDoc::new(src, dst, lid, Some(bl), None, Some(kenlm));
     pipeline.run().expect(
         "Ensure to have shards in res/shards, lid.176.bin at root and blocklist at res/blocklist",
     );