Skip to content

Commit

Permalink
ucd-parse: add support for parsing files under 'extracted/'
Browse files Browse the repository at this point in the history
Closes #46
  • Loading branch information
inquisitivecrystal authored and BurntSushi committed Jul 5, 2022
1 parent 865cf3e commit c61ae95
Show file tree
Hide file tree
Showing 14 changed files with 828 additions and 0 deletions.
61 changes: 61 additions & 0 deletions ucd-parse/src/extracted/derived_bidi_class.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedBidiClass.txt` file.
///
/// This file gives the derived values of the Bidi_Class property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedBidiClass {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived Bidi_Class of the codepoints in this entry, stored as a
/// string (for example `L` or `EN`).
pub bidi_class: String,
}

impl UcdFile for DerivedBidiClass {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedBidiClass.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedBidiClass {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedBidiClass {
    type Err = Error;

    /// Parses one line of `extracted/DerivedBidiClass.txt` into a row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, class) = parse_codepoint_association(line)?;
        let bidi_class = class.to_string();
        Ok(Self { codepoints, bidi_class })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedBidiClass;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedBidiClass {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("00B5 ; L # L& MICRO SIGN\n");
        assert_eq!(parsed.codepoints, 0x00B5);
        assert_eq!(parsed.bidi_class, "L");
    }

    #[test]
    fn parse_range() {
        let parsed = row("0030..0039 ; EN # Nd [10] DIGIT ZERO..DIGIT NINE\n");
        assert_eq!(parsed.codepoints, (0x0030, 0x0039));
        assert_eq!(parsed.bidi_class, "EN");
    }
}
66 changes: 66 additions & 0 deletions ucd-parse/src/extracted/derived_binary_properties.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedBinaryProperties.txt` file.
///
/// This file indicates whether a codepoint has the Bidi_Mirrored property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedBinaryProperties {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived property of the codepoints in this entry. Currently,
/// this is always the string "Bidi_Mirrored".
pub property: String,
}

impl UcdFile for DerivedBinaryProperties {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedBinaryProperties.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedBinaryProperties {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedBinaryProperties {
    type Err = Error;

    /// Parses one line of `extracted/DerivedBinaryProperties.txt` into a row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, name) = parse_codepoint_association(line)?;
        let property = name.to_string();
        Ok(Self { codepoints, property })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedBinaryProperties;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedBinaryProperties {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("0028 ; Bidi_Mirrored # Ps LEFT PARENTHESIS\n");
        assert_eq!(parsed.codepoints, 0x0028);
        assert_eq!(parsed.property, "Bidi_Mirrored");
    }

    #[test]
    fn parse_range() {
        let parsed = row(
            "2A3C..2A3E ; Bidi_Mirrored # Sm [3] INTERIOR PRODUCT..Z NOTATION RELATIONAL COMPOSITION\n",
        );
        assert_eq!(parsed.codepoints, (0x2A3C, 0x2A3E));
        assert_eq!(parsed.property, "Bidi_Mirrored");
    }
}
65 changes: 65 additions & 0 deletions ucd-parse/src/extracted/derived_combining_class.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedCombiningClass.txt` file.
///
/// This file gives the derived values of the Canonical_Combining_Class
/// property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedCombiningClass {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived Canonical_Combining_Class of the codepoints in this entry,
/// stored as a string rather than a number (for example `0` or `230`).
pub combining_class: String,
}

impl UcdFile for DerivedCombiningClass {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedCombiningClass.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedCombiningClass {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedCombiningClass {
    type Err = Error;

    /// Parses one line of `extracted/DerivedCombiningClass.txt` into a row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, class) = parse_codepoint_association(line)?;
        let combining_class = class.to_string();
        Ok(Self { codepoints, combining_class })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedCombiningClass;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedCombiningClass {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("0020 ; 0 # Zs SPACE\n");
        assert_eq!(parsed.codepoints, 0x0020);
        assert_eq!(parsed.combining_class, "0");
    }

    #[test]
    fn parse_range() {
        let parsed = row(
            "1DD1..1DF5 ; 230 # Mn [37] COMBINING UR ABOVE..COMBINING UP TACK ABOVE\n",
        );
        assert_eq!(parsed.codepoints, (0x1DD1, 0x1DF5));
        assert_eq!(parsed.combining_class, "230");
    }
}
66 changes: 66 additions & 0 deletions ucd-parse/src/extracted/derived_decomposition_type.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedDecompositionType.txt` file.
///
/// This file gives the derived values of the Decomposition_Type
/// property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedDecompositionType {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived Decomposition_Type of the codepoints in this entry, stored
/// as a string (for example `Nobreak` or `Canonical`).
pub decomposition_type: String,
}

impl UcdFile for DerivedDecompositionType {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedDecompositionType.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedDecompositionType {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedDecompositionType {
    type Err = Error;

    /// Parses one line of `extracted/DerivedDecompositionType.txt` into a
    /// row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, kind) = parse_codepoint_association(line)?;
        let decomposition_type = kind.to_string();
        Ok(Self { codepoints, decomposition_type })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedDecompositionType;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedDecompositionType {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("00A0 ; Nobreak # Zs NO-BREAK SPACE\n");
        assert_eq!(parsed.codepoints, 0x00A0);
        assert_eq!(parsed.decomposition_type, "Nobreak");
    }

    #[test]
    fn parse_range() {
        let parsed = row(
            "3070..3071 ; Canonical # Lo [2] HIRAGANA LETTER BA..HIRAGANA LETTER PA\n",
        );
        assert_eq!(parsed.codepoints, (0x3070, 0x3071));
        assert_eq!(parsed.decomposition_type, "Canonical");
    }
}
66 changes: 66 additions & 0 deletions ucd-parse/src/extracted/derived_east_asian_width.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedEastAsianWidth.txt` file.
///
/// This file gives the derived values of the East_Asian_Width
/// property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedEastAsianWidth {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived East_Asian_Width of the codepoints in this entry, stored
/// as a string (for example `N` or `F`).
pub east_asian_width: String,
}

impl UcdFile for DerivedEastAsianWidth {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedEastAsianWidth.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedEastAsianWidth {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedEastAsianWidth {
    type Err = Error;

    /// Parses one line of `extracted/DerivedEastAsianWidth.txt` into a row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, width) = parse_codepoint_association(line)?;
        let east_asian_width = width.to_string();
        Ok(Self { codepoints, east_asian_width })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedEastAsianWidth;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedEastAsianWidth {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("00A0 ; N # Zs NO-BREAK SPACE\n");
        assert_eq!(parsed.codepoints, 0x00A0);
        assert_eq!(parsed.east_asian_width, "N");
    }

    #[test]
    fn parse_range() {
        let parsed = row(
            "FF10..FF19 ; F # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE\n",
        );
        assert_eq!(parsed.codepoints, (0xFF10, 0xFF19));
        assert_eq!(parsed.east_asian_width, "F");
    }
}
65 changes: 65 additions & 0 deletions ucd-parse/src/extracted/derived_general_category.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `extracted/DerivedGeneralCategory.txt` file.
///
/// This file gives the derived values of the General_Category property.
///
/// A row is produced by parsing one line of the file with this type's
/// `FromStr` implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedGeneralCategory {
/// The codepoint or codepoint range for this entry.
pub codepoints: Codepoints,
/// The derived General_Category of the codepoints in this entry, stored
/// as a string (for example `Ll` or `Nd`).
pub general_category: String,
}

impl UcdFile for DerivedGeneralCategory {
    /// Relative path of the data file whose rows this type represents.
    fn relative_file_path() -> &'static Path {
        const FILE: &str = "extracted/DerivedGeneralCategory.txt";
        Path::new(FILE)
    }
}

impl UcdFileByCodepoint for DerivedGeneralCategory {
    /// Returns an iterator built from this row's `codepoints` field.
    fn codepoints(&self) -> CodepointIter {
        IntoIterator::into_iter(self.codepoints)
    }
}

impl FromStr for DerivedGeneralCategory {
    type Err = Error;

    /// Parses one line of `extracted/DerivedGeneralCategory.txt` into a row.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `parse_codepoint_association`.
    fn from_str(line: &str) -> Result<Self, Self::Err> {
        let (codepoints, category) = parse_codepoint_association(line)?;
        let general_category = category.to_string();
        Ok(Self { codepoints, general_category })
    }
}

#[cfg(test)]
mod tests {
    use super::DerivedGeneralCategory;

    /// Parses `line` into a row, panicking if parsing fails.
    fn row(line: &str) -> DerivedGeneralCategory {
        line.parse().unwrap()
    }

    #[test]
    fn parse_single() {
        let parsed = row("04D9 ; Ll # CYRILLIC SMALL LETTER SCHWA\n");
        assert_eq!(parsed.codepoints, 0x04D9);
        assert_eq!(parsed.general_category, "Ll");
    }

    #[test]
    fn parse_range() {
        let parsed = row(
            "0660..0669 ; Nd # [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE",
        );
        assert_eq!(parsed.codepoints, (0x0660, 0x0669));
        assert_eq!(parsed.general_category, "Nd");
    }
}
Loading

0 comments on commit c61ae95

Please sign in to comment.