Skip to content

Commit

Permalink
fixes #164, handling duplication with extra line breaks in odt/ott
Browse files — browse the repository at this point in the history
  • Loading branch information
David Bashford committed Aug 3, 2018
1 parent dff3b32 commit 17cb729
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 4 deletions.
3 changes: 3 additions & 0 deletions lib/extractors/odt.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ function extractText( filePath, options, cb ) {
.replace( /^(.Archive).*/, '' )
.replace( /text:p/g, 'textractTextNode' )
.replace( /text:h/g, 'textractTextNode' )
// remove empty nodes
.replace( /<textractTextNode\/>/g, '' )
// remove empty nodes that have styles
.replace( /<textractTextNode[^>]*\/>/g, '' )
.trim()
, $ = cheerio.load( '<body>' + output + '</body>' )
, nodes = $( 'textractTextNode' )
Expand Down
2 changes: 1 addition & 1 deletion test/buffer_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ var test = function(_testFunction, withMime) {
_test(
"ott",
"ott.ott",
"This is a document template, yay templates! Woo templates get me so excited! Woo templates get me so"
"This is a document template, yay templates! Woo templates get me so excited!"
);

_test(
Expand Down
16 changes: 14 additions & 2 deletions test/extract_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,18 @@ describe( 'textract', function() {
});
});

// Test suite covering text extraction from an ODT fixture.
// NOTE(review): spaced.odt presumably contains empty, styled paragraph nodes
// (the duplication case from issue #164) — confirm against the fixture itself.
describe( 'for odt files', function() {
it( 'will extract text from ODT files', function( done ) {
// The fixture is resolved relative to this test file's "files" directory.
var filePath = path.join( __dirname, 'files', 'spaced.odt' );
// fromFileWithPath is defined outside this excerpt; here it receives the
// fixture path and a callback taking ( error, text ).
fromFileWithPath( filePath, function( error, text ) {
// Extraction must not report an error ...
expect( error ).to.be.null;
expect( text ).to.be.an( 'string' );
// ... and must yield exactly this string, nothing more and nothing less.
expect( text ).to.eql( 'This Is some text' );
// Tell the test runner that the asynchronous test has finished.
done();
});
});
});

describe( 'for image files', function() {
it( 'will extract text from PNG files', function( done ) {
var filePath = path.join( __dirname, 'files', 'testphoto.png' );
Expand Down Expand Up @@ -675,8 +687,8 @@ describe( 'textract', function() {
test(
'ott',
'ott.ott',
'This is a document template, yay templates! Woo templates get me so excited! Woo templates get me so',
'This is a document template, yay templates!\nWoo templates get me so excited!\nWoo templates get me so'
'This is a document template, yay templates! Woo templates get me so excited!',
'This is a document template, yay templates!\nWoo templates get me so excited!'
);

test(
Expand Down
Binary file added test/files/spaced.odt
Binary file not shown.
2 changes: 1 addition & 1 deletion test/url_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ describe( 'fromUrl tests', function() {
test(
'ott',
'ott.ott',
'This is a document template, yay templates! Woo templates get me so excited! Woo templates get me so'
'This is a document template, yay templates! Woo templates get me so excited!'
);

test(
Expand Down

0 comments on commit 17cb729

Please sign in to comment.