Skip to content

Commit

Permalink
Merge pull request #14 from oscar-project/dev
Browse files Browse the repository at this point in the history
Change annotation to quality_warning
  • Loading branch information
Uinelj authored Jan 24, 2023
2 parents 529fab9 + 133ea94 commit c93b0ff
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "oscar-io"
version = "0.1.3"
version = "0.2.1"
edition = "2021"
description = "Readers/Writers for OSCAR Corpora."
documentation = "https://docs.rs/oscar-io"
Expand Down
22 changes: 14 additions & 8 deletions src/v3/types/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@ type Identification = IdentificationGen<String>;
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]

/// OSCAR-specific metadata
/// TODO: make it a HashMap
/// TODO: make annotation/categories hashmaps
// TODO: make it a HashMap
// TODO: make annotation/categories hashmaps
/// Contains document metadata:
/// - `identification` is the document-level language identification (see [Identification])
/// - `harmful_pp` is the perplexity of the document, related to a model trained to recognize adult documents
/// - `quality_warnings` (ex-annotation) contains tags for some length/content based quality filters
/// - `categories` contains categories based on the url of the document. Uses the ut1 blocklist as a base.
/// - `sentence_identifications` contains line-level identifications.
pub struct Metadata {
identification: Identification,
harmful_pp: Option<f32>,
annotation: Option<Vec<String>>,
quality_warnings: Option<Vec<String>>,
categories: Option<Vec<String>>,
sentence_identifications: Vec<Option<Identification>>,
}
Expand All @@ -34,16 +40,16 @@ impl Metadata {
Metadata {
identification: identification.clone(),
harmful_pp: None,
annotation: None,
quality_warnings: None,
categories: None,
sentence_identifications: sentence_identifications.to_owned(),
}
}

pub fn add_annotation(&mut self, annotation: String) {
match &mut self.annotation {
match &mut self.quality_warnings {
Some(anno) => anno.push(annotation),
None => self.annotation = Some(vec![annotation]),
None => self.quality_warnings = Some(vec![annotation]),
}
}

Expand All @@ -62,7 +68,7 @@ impl Metadata {

/// Get a reference to the metadata's annotation.
pub fn annotation(&self) -> Option<&Vec<String>> {
self.annotation.as_ref()
self.quality_warnings.as_ref()
}

/// Get a reference to the metadata's sentence identifications.
Expand All @@ -83,7 +89,7 @@ impl Default for Metadata {
Self {
identification: Identification::new(LanguageTag::parse("en".to_string()).unwrap(), 1.0),
harmful_pp: None,
annotation: None,
quality_warnings: None,
categories: None,
sentence_identifications: vec![Some(Identification::new(
LanguageTag::parse("en".to_string()).unwrap(),
Expand Down

0 comments on commit c93b0ff

Please sign in to comment.