Commit efbbb68
Using spaCy `nlp.pipe` now processes texts sentence-wise, just like for `nlp(...)`. (#41)

* `pipe` now does sentence-wise predictions just like __call__

* Update changelog
tomaarsen authored Oct 31, 2023
1 parent 38bee88 commit efbbb68
Showing 2 changed files with 31 additions and 7 deletions.
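For context (not part of the diff below), the user-facing effect is that batch inference through `nlp.pipe` now yields the same entities as calling `nlp(...)` on each text. A minimal usage sketch, assuming the `span_marker` component is registered as described in the project README; the checkpoint name and texts are only examples:

```python
import spacy

# Load a spaCy pipeline and add the SpanMarker component on top of it.
# The checkpoint name is only an example.
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
nlp.add_pipe("span_marker", config={"model": "tomaarsen/span-marker-roberta-large-ontonotes5"})

texts = [
    "Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.",
    "Cleopatra VII was the last active ruler of the Ptolemaic Kingdom of Egypt.",
]

# Batch processing: with this commit, each Doc is split into sentences internally ...
for doc in nlp.pipe(texts, batch_size=2):
    print([(ent.text, ent.label_) for ent in doc.ents])

# ... so the entities match those from processing each text individually.
for text in texts:
    print([(ent.text, ent.label_) for ent in nlp(text).ents])
```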
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@ Types of changes
 ### Changed
 
 - Changed the error when an empty sentence is provided to the tokenizer.
+- Using spaCy `nlp.pipe` now processes texts sentence-wise, just like for `nlp(...)`.
 
 ### Fixed

37 changes: 30 additions & 7 deletions span_marker/spacy_integration.py
@@ -126,22 +126,45 @@ def pipe(self, stream, batch_size=128):
             stream = self.nlp.pipe(stream, batch_size=batch_size)
 
         for docs in minibatch(stream, size=batch_size):
-            inputs = [[token.text if not token.is_space else "" for token in doc] for doc in docs]
+            inputs = [
+                [[token.text if not token.is_space else "" for token in sent] for sent in doc.sents] for doc in docs
+            ]
+            tokens = [tokens for sentences in inputs for tokens in sentences]
+            document_id = [idx for idx, sentences in enumerate(inputs) for _ in sentences]
+            sentence_id = [idx for sentences in inputs for idx in range(len(sentences))]
 
             # use document-level context in the inference if the model was also trained that way
             if self.model.config.trained_with_document_context:
-                inputs = self.convert_inputs_to_dataset(inputs)
+                inputs = Dataset.from_dict(
+                    {
+                        "tokens": tokens,
+                        "document_id": document_id,
+                        "sentence_id": sentence_id,
+                    }
+                )
+            else:
+                inputs = tokens
 
             entities_list = self.model.predict(inputs, batch_size=self.batch_size)
-            for doc, entities in zip(docs, entities_list):
+
+            ents_list = []
+            for idx, entities in enumerate(entities_list):
+                doc_id = document_id[idx]
+                num_prior_sentences = sentence_id[idx]
+                offset = len(sum(tokens[idx - num_prior_sentences : idx], start=[]))
                 ents = []
                 for entity in entities:
-                    start = entity["word_start_index"]
-                    end = entity["word_end_index"]
-                    span = doc[start:end]
+                    start = entity["word_start_index"] + offset
+                    end = entity["word_end_index"] + offset
+                    span = docs[doc_id][start:end]
                     span.label_ = entity["label"]
                     ents.append(span)
+                if doc_id == len(ents_list):
+                    ents_list.append(ents)
+                else:
+                    ents_list[-1].extend(ents)
+
+            for doc, ents in zip(docs, ents_list):
                 self.set_ents(doc, ents)
 
-                yield doc
+            yield from docs
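For reference, here is a self-contained sketch of the bookkeeping introduced above: `document_id`, `sentence_id`, and the token `offset` map sentence-level word indices back to document-level token indices. The tokens and predictions are made up for illustration; only the index arithmetic mirrors the new code:

```python
# Toy tokenised sentences grouped per document (stand-in for the spaCy Docs).
inputs = [
    [["Alice", "lives", "in", "Paris", "."], ["She", "works", "at", "Acme", "."]],  # doc 0
    [["Bob", "visited", "Berlin", "."]],                                            # doc 1
]

# Flatten to one entry per sentence, remembering the source document and the
# sentence's position within that document (mirroring the comprehensions in the diff).
tokens = [sent for sentences in inputs for sent in sentences]
document_id = [idx for idx, sentences in enumerate(inputs) for _ in sentences]
sentence_id = [idx for sentences in inputs for idx in range(len(sentences))]

# Pretend sentence-level predictions; word indices are relative to the sentence.
entities_list = [
    [{"word_start_index": 3, "word_end_index": 4, "label": "LOC"}],  # "Paris"
    [{"word_start_index": 3, "word_end_index": 4, "label": "ORG"}],  # "Acme"
    [{"word_start_index": 2, "word_end_index": 3, "label": "LOC"}],  # "Berlin"
]

for idx, entities in enumerate(entities_list):
    doc_id = document_id[idx]
    num_prior_sentences = sentence_id[idx]
    # Total token count of the earlier sentences from the same document.
    offset = len(sum(tokens[idx - num_prior_sentences : idx], start=[]))
    for entity in entities:
        start = entity["word_start_index"] + offset
        end = entity["word_end_index"] + offset
        print(doc_id, start, end, entity["label"])

# Prints:
# 0 3 4 LOC   ("Paris", first sentence of doc 0, offset 0)
# 0 8 9 ORG   ("Acme", second sentence of doc 0, offset 5)
# 1 2 3 LOC   ("Berlin", first sentence of doc 1, offset 0)
```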
