Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode sentence boundaries #24

Merged
merged 4 commits into from
May 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
readme = "README.md"
description = """
This crate provides Grapheme Cluster and Word boundaries
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""

Expand Down
7 changes: 7 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")

# Sentence-break properties: flatten every category's (start, end) ranges into
# (start, end, category) entries ordered by range start, then emit the
# "sentence" break module.
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = sorted(
    ((lo, hi, name) for name in sentence_cats for (lo, hi) in sentence_cats[name]),
    key=lambda entry: entry[0],
)
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
15 changes: 15 additions & 0 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,23 @@ def create_words_data(f):
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
    """Write the TEST_SENTENCE table to `f`.

    Loads the cases from auxiliary/SentenceBreakTest.txt with
    `load_test_data`, pairs each case's segments with the flat list of the
    segments' elements, and emits the result through `unicode.emit_table`.
    """
    cases = load_test_data("auxiliary/SentenceBreakTest.txt")

    # One entry per case: (all elements of every segment, in order; the segments).
    # The second member of each loaded pair is not needed here.
    table = [
        ([elem for segment in segments for elem in segment], segments)
        for (segments, _unused) in cases
    ]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SENTENCE", table, table_type, True, showfun, True)

if __name__ == "__main__":
    # Regenerate testdata.rs: the shared preamble first, then the grapheme,
    # word and sentence test tables, in that order.
    with open("testdata.rs", "w") as out:
        out.write(unicode.preamble)
        for emit in (create_grapheme_data, create_words_data, create_sentence_data):
            emit(out)
40 changes: 39 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
Expand Down Expand Up @@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

mod grapheme;
mod tables;
mod word;
mod sentence;

#[cfg(test)]
mod test;
Expand Down Expand Up @@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
tomcumming marked this conversation as resolved.
Show resolved Hide resolved

/// Returns an iterator over substrings of `self`, split on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// and their offsets. See `split_sentence_bounds()` for more information.
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}

impl UnicodeSegmentation for str {
Expand Down Expand Up @@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}

// Builds the `UnicodeSentences` iterator for this `str` by delegating to
// `sentence::new_unicode_sentences`.
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}

// Builds the `USentenceBounds` iterator for this `str` by delegating to
// `sentence::new_sentence_bounds`.
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}

// Builds the `USentenceBoundIndices` iterator for this `str` by delegating to
// `sentence::new_sentence_bound_indices`.
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}
Loading