Skip to content

Commit

Permalink
Merge pull request #76 from oscar-project/dev-ja-blocklist
Browse files Browse the repository at this point in the history
feat(blocklists): ability to use multiple blocklists
  • Loading branch information
Uinelj authored Dec 16, 2022
2 parents 6918d3d + 89600b3 commit aa1b918
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ itertools = "0.10.0"
tokio = { version = "1", features = ["full"] }
tokio-util = {version="0.6.6", features=["compat"]}
warc = {version="0.3.0", features=["with_serde"]}
ut1_blocklist = "0.1.0"
ut1_blocklist = "0.1.1"
fasttext = "0.7.0"
bytes = "1"
rayon = "1"
Expand Down
7 changes: 7 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,13 @@ pub struct Pipeline {
)]
pub blocklist: Option<PathBuf>,

#[structopt(
parse(from_os_str),
long = "domain-blocklists",
help = "domain-blocklists path. For folders, will treat each file as a different blocklist. For files, filename=annotation. use ut1-blocklist for using ut1 blocklist annotations"
)]
pub domain_blocklists: Option<Vec<PathBuf>>,

#[structopt(
parse(from_os_str),
long = "kenlms-path",
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![doc = include_str!("../README.md")]
pub(crate) mod cli;
pub mod error;
pub mod filtering;
pub mod identifiers;
Expand Down
10 changes: 8 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,14 @@ async fn main() -> Result<(), error::Error> {
cli::Ungoliant::Pipeline(p) => {
let mut schema_filepath = p.dst.clone();
// let p = pipeline::OscarMetadata::new(p.src, p.dst, p.lid_path);
let p =
pipelines::OscarDocNew::new(p.src, p.dst, p.lid_path, p.blocklist, p.kenlms_path);
let p = pipelines::OscarDocNew::new(
p.src,
p.dst,
p.lid_path,
p.blocklist,
p.domain_blocklists,
p.kenlms_path,
);
p.run()?;

schema_filepath.push("metadata_schema.json");
Expand Down
31 changes: 29 additions & 2 deletions src/pipelines/oscardoc/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,14 @@ use warc::{Record, WarcHeader};
use crate::io::LangFilesDoc;

const DOC_THRESHOLD: f32 = 0.6f32;

// TODO: Implement structopt directly here.
/// Configuration of the OSCAR document pipeline.
///
/// Holds the paths handed to [`OscarDoc::new`]. The optional fields switch
/// individual annotators on when they are `Some`.
pub struct OscarDoc {
/// Input location. Presumably the folder containing the shards to process
/// (TODO confirm against `run`).
src: PathBuf,
/// Output location. The binary entry point also pushes
/// `metadata_schema.json` onto a clone of this path, so it is treated as a folder.
dst: PathBuf,
/// Path related to language identification. Presumably the model file used
/// to build the `FastText` identifier passed to `process_shard` (TODO confirm).
lid_path: PathBuf,
/// Folder passed to `Blocklist::with_folder` to build the `"adult"`
/// content detector. No such detector is added when `None`.
blocklist: Option<PathBuf>,
/// Additional blocklist paths. In `process_shard`, every entry that is a
/// regular file is loaded with `Blocklist::from_domains_file`, using the
/// file name as the annotation. Entries that are not files are skipped there.
domain_blocklists: Option<Vec<PathBuf>>,
/// Optional path for KenLM resources. How it is consumed is not visible
/// here (TODO confirm).
kenlms_path: Option<PathBuf>,
}

Expand All @@ -65,6 +68,7 @@ impl OscarDoc {
dst: PathBuf,
lid_path: PathBuf,
blocklist: Option<PathBuf>,
domain_blocklists: Option<Vec<PathBuf>>,
kenlms_path: Option<PathBuf>,
) -> Self {
if blocklist.is_none() {
Expand All @@ -77,6 +81,7 @@ impl OscarDoc {
dst,
lid_path,
blocklist,
domain_blocklists,
kenlms_path,
}
}
Expand Down Expand Up @@ -128,6 +133,7 @@ impl OscarDoc {
identifier: &FastText,
filter: Option<record::FilterKind>,
blocklist: &Option<PathBuf>,
domain_blocklists: &Option<Vec<PathBuf>>,
) -> Result<(usize, Vec<(Document, Location)>), Error> {
info!("working on shard: {:?}", shard_path);

Expand Down Expand Up @@ -220,10 +226,31 @@ impl OscarDoc {
.add(Box::new(Noisy::default()));

// TODO: Same here, we instantiate it once by shard
// add ut1 blocklist adult annotation
if let Some(path) = blocklist {
let bl = Blocklist::with_folder("adult", path)?;
let bl = Blocklist::with_folder("adult".to_string(), path)?;
annotator.add(Box::new(ContentDetector::new(bl)));
}

// add other (custom) blocklists
// Each entry of `domain_blocklists` that is a regular file becomes its own
// `ContentDetector`, so matching documents get tagged with that blocklist's annotation.
if let Some(paths) = domain_blocklists {
for path in paths {
// NOTE(review): the `--domain-blocklists` help text says folders are
// expanded into one blocklist per file, but only regular files are
// handled here. Directories and non-existent paths fall through this
// `if` without any log message.
if path.is_file() {
// The annotation is the final path component as returned by
// `file_name()` (so any extension is kept), converted lossily to UTF-8.
let annotation = path
.file_name()
.map(|filename| filename.to_string_lossy().to_string());
if let Some(annotation) = annotation {
// A failure to load the domains file is propagated with `?`
// rather than skipped.
let bl = Blocklist::from_domains_file(annotation, path)?;
info!("added content detector for annotation from {path:?}");
// NOTE(review): this prints the whole domain set at info level every
// time the annotator is built (once per shard, per the TODO above).
// Consider `debug!` or logging only a count.
info!("domains: {:?}", bl.domains());
annotator.add(Box::new(ContentDetector::new(bl)));
} else {
// Reached only when the path has no final component to name the annotation.
error!("Could not get annotation for blocklist {path:?}, skipping");
}
}
}
}

annotator
};

Expand Down Expand Up @@ -480,7 +507,7 @@ impl Pipeline<()> for OscarDoc {
let shards_results = results.map(|(idx, shard)| {
(
idx,
Self::process_shard(&shard, &cls, None, &self.blocklist),
Self::process_shard(&shard, &cls, None, &self.blocklist, &self.domain_blocklists),
)
});

Expand Down
16 changes: 8 additions & 8 deletions src/transformers/content_detector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ use url::Url;

use super::Annotate;

pub struct ContentDetector<'a> {
bl: Blocklist<'a>,
pub struct ContentDetector {
bl: Blocklist,
}

impl<'a> ContentDetector<'a> {
impl ContentDetector {
/// Create a new [ContentDetector] based on a specified [Blocklist].
pub fn new(bl: Blocklist<'a>) -> Self {
pub fn new(bl: Blocklist) -> Self {
Self { bl }
}

Expand All @@ -40,7 +40,7 @@ impl<'a> ContentDetector<'a> {
}
}

impl<'a> Annotate<Document> for ContentDetector<'a> {
impl Annotate<Document> for ContentDetector {
/// Checks if domain/url is present in provided blocklist, and adds a tag
/// corresponding to blocklist kind if true.
fn annotate(&self, doc: &mut Document) {
Expand All @@ -50,7 +50,7 @@ impl<'a> Annotate<Document> for ContentDetector<'a> {
// if we were successful, detect domain and url
if let Some(valid_url) = url {
if self.bl.detect_domain(&valid_url) || self.bl.detect_url(&valid_url) {
debug!("Document {} flagged as adult", doc.warc_id());
debug!("Document {} flagged as {}", doc.warc_id(), self.bl.kind());
doc.metadata_mut()
.add_annotation(self.bl.kind().to_string());
}
Expand Down Expand Up @@ -102,7 +102,7 @@ mod tests {
let mut domains = HashSet::new();
domains.insert("foo.bar".to_string());

let bl = Blocklist::new("adult", domains, HashSet::new());
let bl = Blocklist::new("adult".to_string(), domains, HashSet::new());
let cd = ContentDetector::new(bl);

cd.annotate(&mut doc);
Expand All @@ -120,7 +120,7 @@ mod tests {
let mut domains = HashSet::new();
domains.insert("baz.quux".to_string());

let bl = Blocklist::new("adult", domains, HashSet::new());
let bl = Blocklist::new("adult".to_string(), domains, HashSet::new());
let cd = ContentDetector::new(bl);

cd.annotate(&mut doc);
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/lsh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::pipelines::oscardoc::types::Document;
use warc::WarcHeader;

use super::Annotate;
use log::warn;
use log::{debug, warn};
pub struct LSH {
builder: TlshBuilder,
}
Expand All @@ -26,7 +26,7 @@ impl Annotate<Document> for LSH {
let annotation = format!("tlsh:{}", hash.hash());
doc.metadata_mut().add_annotation(annotation);
}
Err(e) => warn!(
Err(e) => debug!(
"Could not compute a hash for document {:?}: {:?}",
String::from_utf8_lossy(
doc.warc_headers()
Expand Down
3 changes: 2 additions & 1 deletion tests/oscardoc_rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ fn gen_corpus() {
let bl = Path::new("res/blocklist/").to_path_buf();
let kenlm = Path::new("res/kenlm/").to_path_buf();

let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm));
//TODO test with custom blocklists
let pipeline = OscarDoc::new(src, dst, lid, Some(bl), None, Some(kenlm));
pipeline.run().expect(
"Ensure to have shards in res/shards, lid.176.bin at root and blocklist at res/blocklist",
);
Expand Down

0 comments on commit aa1b918

Please sign in to comment.