Skip to content

Commit

Permalink
fixes #145, handling japanese full-width characters
Browse files Browse the repository at this point in the history
  • Loading branch information
David Bashford committed Aug 3, 2018
1 parent 17cb729 commit 72eb3be
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ textract.fromUrl(url, config, function( error, text ) {})
## Release Notes

### 2.3.1 (pending)
* [#164](https://github.com/dbashford/textract/issues/164). Fixed issue with extra text nodes in odt/ott extraction.
* [#149](https://github.com/dbashford/textract/issues/149). RTF extraction error fixed by [#166](https://github.com/dbashford/textract/pull/166).
* [#145](https://github.com/dbashford/textract/issues/145). Handling Japanese full-width characters.

### 2.3.0
* [#149](https://github.com/dbashford/textract/issues/149). Fixed a few text errors that had cropped up with previous PRs/library updates
Expand Down
4 changes: 2 additions & 2 deletions lib/extract.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ var fs = require( 'fs' )
, totalExtractors = 0
, satisfiedExtractors = 0
, hasInitialized = false
, WHITELIST_PRESERVE_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w\n\r]*/g // eslint-disable-line max-len
, WHITELIST_STRIP_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w]*/g // eslint-disable-line max-len
, WHITELIST_PRESERVE_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFF01-\uFFE6 \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w\n\r]*/g // eslint-disable-line max-len
, WHITELIST_STRIP_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFF01-\uFFE6 \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w]*/g // eslint-disable-line max-len
;

function registerExtractor( extractor ) {
Expand Down
10 changes: 10 additions & 0 deletions test/extract_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,16 @@ describe( 'textract', function() {
});
});

it( 'can handle manage PDFS with full-width Japanese characters', function( done ) {
var filePath = path.join( __dirname, 'files', 'full-width-j.pdf' );
fromFileWithPath( filePath, function( error, text ) {
expect( error ).to.be.null;
expect( text ).to.be.a( 'string' );
expect( text.replace( / /g, '' ).substring( 2685, 2900 ) ).to.eql( '$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~⦅⦆。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚ᄀᄁᆪᄂᆬᆭᄃᄄᄅᆰᆱᆲᆳᆴᆵᄚᄆᄇᄈᄡᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ¢£¬ ̄¦¥₩F' );
done();
});
});

// it( 'can handle arabic', function( done ) {
// var filePath = path.join( __dirname, 'files', 'arabic.pdf' );
// fromFileWithPath( filePath, { preserveLineBreaks: true }, function( error, text ) {
Expand Down
Binary file added test/files/full-width-j.pdf
Binary file not shown.

0 comments on commit 72eb3be

Please sign in to comment.