summarize.py
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from time import time
from os import path
import re
from bs4 import BeautifulSoup
# There are two models that we tested
# 1. "facebook/bart-large-cnn"
# 2. "philschmid/bart-large-cnn-samsum"
model_fn, text_fn = "facebook/bart-large-cnn", "./npocrawler/site_sarsef_org/text-.txt"
local_model_fn = "./hf_models/" + model_fn.split("/")[1]
s = time()
if not path.exists(local_model_fn):
print(f"{model_fn} not saved locally, downloading...")
# Trigger the download by doing this
summarizer = pipeline("summarization", model=model_fn)
summarizer.save_pretrained(local_model_fn)
tokenizer = AutoTokenizer.from_pretrained(local_model_fn)
model = AutoModelForSeq2SeqLM.from_pretrained(local_model_fn)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
print(f"Model located in {time() - s}")
with open(text_fn, "r") as f:
    text = f.read().replace("\n", " ").replace("\t", " ")
# Collapse runs of whitespace into a single space
text = re.sub(r"\s\s+", " ", text)
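# BeautifulSoup is imported above but not used here; if the crawler output still
# contained HTML markup, stripping tags first could look like the sketch below
# (an assumption about the input format, not something this script currently does):
#   text = BeautifulSoup(text, "html.parser").get_text(" ")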
print("-----------------")
print(text)
print("-----------------")
# Chunk the text into blocks of num_words words so each chunk fits the model's input
num_words = 512
# Break everything into words
text_array = text.split(" ")
# Break the text into parts for the summarization
parts = []
for i in range(0, len(text_array), num_words):
    parts.append(" ".join(text_array[i:(i + num_words)]))
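# Note: splitting on a fixed word count is only a rough proxy for the model's real
# limit, which is measured in tokens (BART checkpoints accept roughly 1024 tokens).
# A token-aware alternative sketch, using the tokenizer already loaded above
# (max_tokens is an assumed headroom value, not from the original script):
#   token_ids = tokenizer.encode(text, add_special_tokens=False)
#   max_tokens = 900
#   parts = [
#       tokenizer.decode(token_ids[i:i + max_tokens], skip_special_tokens=True)
#       for i in range(0, len(token_ids), max_tokens)
#   ]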
print(f"- Original text was {len(text)} characters and {len(text_array)} words")
print(f"- Broke text into {len(parts)} batches to support summarization")
for p in parts:
    summary = summarizer(p)
    print(f"Summary: {summary[0]['summary_text']}")
    print("")