This repository has been archived by the owner on Aug 19, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 73
/
get_arxiv_multiprocessing.py
63 lines (53 loc) · 2.19 KB
/
get_arxiv_multiprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
import multiprocessing
import re
import time
import urllib.request
from datetime import datetime
from multiprocessing import Pool
from pathlib import Path

import feedparser
def _clean_summary(summary):
    """Strip newlines, <p> tags, backslashes and URLs from a feed summary."""
    cleaned = (
        summary.replace('\n', '')
        .replace('</p>', '')
        .replace('<p>', '')
        .replace('\\', '')
    )
    # Drop anything that looks like a link ("http" up to the next whitespace).
    return re.sub(r'http\S+', '', cleaned)


def _strip_arxiv_tag(title):
    """Remove the "(arXiv:...)" tag from a feed title when it is present."""
    _, marker, tail = title.partition('(arXiv:')
    if not marker:
        # No tag in this title: indexing the split result here used to raise
        # IndexError, so just return the title trimmed of whitespace.
        return title.strip()
    # Rebuild the exact tag text: the marker, everything up to the first
    # closing parenthesis after it, and that parenthesis.
    tag = marker + tail.split(')')[0] + ')'
    return title.replace(tag, '').strip()


def get_article_info(url):
    """Download an RSS feed and build one title-generation sample per entry.

    Args:
        url: Address of the RSS feed to fetch.

    Returns:
        A list of dicts, one per feed entry, each with the keys
        "instruction" (a fixed prompt), "input" (the cleaned summary) and
        "output" (the title with any "(arXiv:...)" tag removed).

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the request fails
        (for example ``urllib.error.URLError``).
    """
    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as response:
        rss = response.read()
    feed = feedparser.parse(rss)
    data = []
    # Walk every article and collect its cleaned summary and title.
    for entry in feed.entries:
        summary = _clean_summary(entry.summary)
        title = _strip_arxiv_tag(entry.title)
        info = {
            "instruction": "If you are an expert in writing papers, please generate a good paper title for this paper based on other authors' descriptions of their abstracts.",
            "input": str(summary),
            "output": str(title)
        }
        data.append(info)
    return data
if __name__ == '__main__':
    # RSS feeds carrying arXiv's daily updates for the artificial-intelligence,
    # computer-vision and machine-learning categories.
    urls = [
        "http://export.arxiv.org/rss/cs.AI",  # artificial intelligence
        "http://export.arxiv.org/rss/cs.CV",  # computer vision
        "http://export.arxiv.org/rss/cs.LG",  # machine learning
    ]
    # Start timing.
    start = time.time()
    # One worker per feed is enough; never exceed CPU core count + 1.
    processes = min(len(urls), multiprocessing.cpu_count() + 1)
    # Fetch the feeds in parallel worker processes (a process pool, not threads).
    with Pool(processes) as p:
        per_feed = p.map(get_article_info, urls)
    # Flatten the per-feed lists into a single list of samples.
    data = [info for subdata in per_feed for info in subdata]
    # Today's date names the output file.
    today = datetime.today().date()
    # Make sure the output directory exists so a first run does not fail
    # with FileNotFoundError.
    out_dir = Path('data')
    out_dir.mkdir(parents=True, exist_ok=True)
    # Save the samples as JSON.
    with open(out_dir / (str(today) + '.json'), 'w+') as f:
        json.dump(data, f)
    # Stop timing and report the article count and elapsed seconds.
    end = time.time()
    print('共处理了%s篇文章,用时%.2f秒' % (len(data), end-start))
    # Sample output: 共处理了371篇文章,用时4.51秒
    # (i.e. 371 articles processed in 4.51 seconds)