Commit

can now generate MathML reports over DLMF's .html5 files
dginev committed Mar 21, 2019
1 parent e625c69 commit 01dd217
Showing 2 changed files with 46 additions and 9 deletions.
44 changes: 36 additions & 8 deletions examples/corpus_mathml_stats.rs
@@ -4,12 +4,17 @@

//! Given a `CorTeX` corpus of HTML5 documents, extract statistics on MathML use
//! as per https://github.com/mathml-refresh/mathml/issues/55#issuecomment-474768228
//!
//! example use for arXMLiv:
//! `cargo run --release --example corpus_mathml_stats /data/datasets/dataset-arXMLiv-08-2018 arxmliv_mathml_statistics.txt`
//! example use for DLMF:
//! `cargo run --release --example corpus_mathml_stats /var/local/dlmf dlmf_mathml_statistics.txt .html5`
extern crate libxml;
extern crate llamapun;
extern crate time;

use std::collections::HashMap;
use std::collections::{HashSet, HashMap};
use std::env;
use std::fs::File;
use std::io::prelude::*;
@@ -34,18 +34,40 @@ pub fn main() -> Result<(), Error> {
};
let node_statistics_filepath = match input_args.next() {
Some(path) => path,
None => "mathml_statistics.txt".to_string(),
None => "statistics_mathml.txt".to_string(),
};

let extension_filter = input_args.next();

let node_statistics_file = File::create(node_statistics_filepath)?;
let mut node_statistics_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);

let mut catalog = HashMap::new();
let mut corpus = Corpus::new(corpus_path);
corpus.extension = extension_filter;

// open-ended attributes, for which we won't add value categories in the statistics
// (we still count the attributes)
// some of the questions are interested in the numeric values used, so we'd best keep those...
let mut open_ended = HashSet::new();
open_ended.insert("id");
open_ended.insert("xref");
open_ended.insert("alttext");
open_ended.insert("href");
// open_ended.insert("width");
// open_ended.insert("height");
open_ended.insert("altimg");
// open_ended.insert("altimg-width");
// open_ended.insert("altimg-height");
// open_ended.insert("altimg-valign");
// open_ended.insert("minsize");
// open_ended.insert("maxsize");
// open_ended.insert("voffset");

for document in corpus.iter() {
// Recursively descend through the math nodes and increment the frequencies of occurrence
for math in document.get_math_nodes() {
dfs_record(&math, &mut catalog);
dfs_record(&math, &open_ended, &mut catalog);
}

// Increment document counter, bookkeep
@@ -76,7 +76,7 @@ pub fn main() -> Result<(), Error> {
node_statistics_writer.flush()
}

fn dfs_record(node: &Node, catalog: &mut HashMap<String, u64>)
fn dfs_record(node: &Node, open_ended: &HashSet<&str>, catalog: &mut HashMap<String, u64>)
{
if node.is_text_node() {
return; // Skip text nodes.
@@ -93,8 +93,9 @@ fn dfs_record(node: &Node, catalog: &mut HashMap<String, u64>)
let node_attr_count = catalog.entry(node_attr_key.clone()).or_insert(0);
*node_attr_count += 1;

let attr_values = val.split_whitespace().collect::<Vec<_>>();
if (attr != "id") && (attr != "xref") && (attr!="alttext") && (attr!="href") {
if !open_ended.contains(attr.as_str()) {
let attr_values = val.split_whitespace().collect::<Vec<_>>();
// altimg-* attributes have specific styling info; we don't need to record it here.
for val in attr_values {
let node_attr_val_key = format!("{}[{}]", node_attr_key, val);
let node_attr_val_count = catalog.entry(node_attr_val_key).or_insert(0);
@@ -105,11 +105,11 @@ fn dfs_record(node: &Node, catalog: &mut HashMap<String, u64>)

// Recurse into all children (DFS)
if let Some(child) = node.get_first_child() {
dfs_record(&child, catalog);
dfs_record(&child, open_ended, catalog);
let mut child_node = child;

while let Some(child) = child_node.get_next_sibling() {
dfs_record(&child, catalog);
dfs_record(&child, open_ended, catalog);
child_node = child;
}
}
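For orientation, here is a minimal, self-contained sketch (not part of the commit) of the counting scheme that dfs_record implements with the new open_ended set: open-ended attributes only bump an attribute counter, while every other attribute additionally gets one counter per whitespace-separated value. The "element@attr" key shape below is a made-up stand-in; the real node_attr_key is built in a portion of dfs_record elided from this diff, and only the "{}[{}]" value-key format is visible above.

use std::collections::{HashMap, HashSet};

fn record_attr(
    element: &str,
    attr: &str,
    value: &str,
    open_ended: &HashSet<&str>,
    catalog: &mut HashMap<String, u64>,
) {
    // Hypothetical key shape for the element/attribute pair.
    let attr_key = format!("{}@{}", element, attr);
    *catalog.entry(attr_key.clone()).or_insert(0) += 1;
    // Open-ended attributes (id, xref, alttext, href, altimg) are counted,
    // but their individual values are not recorded.
    if !open_ended.contains(attr) {
        for val in value.split_whitespace() {
            *catalog.entry(format!("{}[{}]", attr_key, val)).or_insert(0) += 1;
        }
    }
}

fn main() {
    let mut open_ended = HashSet::new();
    open_ended.insert("id");
    let mut catalog = HashMap::new();
    record_attr("mfrac", "linethickness", "0", &open_ended, &mut catalog);
    record_attr("mrow", "id", "E1.m2", &open_ended, &mut catalog);
    // Yields: mfrac@linethickness -> 1, mfrac@linethickness[0] -> 1, mrow@id -> 1
    println!("{:?}", catalog);
}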
11 changes: 10 additions & 1 deletion src/data.rs
@@ -33,6 +33,9 @@ pub struct Corpus {
pub senna_options: Cell<SennaParseOptions>,
/// Default setting for `DNM` generation
pub dnm_parameters: DNMParameters,
/// Extension of corpus files (for specially tailored resources such as DLMF's .html5)
/// defaults to selecting .html AND .xhtml files
pub extension: Option<String>
}

/// File-system iterator yielding individual documents
@@ -132,7 +135,12 @@ impl<'iter> Iterator for DocumentIterator<'iter> {
break;
} else if let Some(Ok(ref entry)) = next_entry {
let file_name = entry.file_name().to_str().unwrap_or("").to_owned();
if file_name.ends_with(".html") || file_name.ends_with(".xhtml") {
let selected = if let Some(ref extension) = self.corpus.extension {
file_name.ends_with(extension)
} else {
file_name.ends_with(".html") || file_name.ends_with(".xhtml")
};
if selected {
let path = entry.path().to_str().unwrap_or("").to_owned();
let doc_result = Document::new(path, self.corpus);
return match doc_result {
@@ -155,6 +163,7 @@ impl<'iter> Iterator for DocumentIterator<'iter> {
impl Default for Corpus {
fn default() -> Corpus {
Corpus {
extension: None,
path: ".".to_string(),
tokenizer: Tokenizer::default(),
xml_parser: Parser::default(),
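A minimal usage sketch (not part of this commit) of how the new extension field on Corpus is meant to be used for DLMF; it assumes Corpus is importable as llamapun::data::Corpus and that Corpus::new takes the corpus root as a String, as in the example file above.

extern crate llamapun;

use llamapun::data::Corpus;

fn main() {
    let mut corpus = Corpus::new("/var/local/dlmf".to_string());
    // Only yield files ending in ".html5"; leaving this as None keeps the
    // previous behavior of accepting ".html" and ".xhtml".
    corpus.extension = Some(".html5".to_string());
    let document_count = corpus.iter().count();
    println!("DLMF documents found: {}", document_count);
}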
