format.py
import json
import os
import csv


def get_details(file):
    data = json.load(file)
    paper_id = data['paper_id']
    title = data['metadata']['title']

    # Abstract
    abstract_text = ""
    for paragraph in data['abstract']:
        abstract_text += paragraph['text']

    # Authors
    authors_list = []
    for author in data['metadata']['authors']:
        authors_list.append(author['first'] + ' ' + author['last'])
    authors = ', '.join(authors_list)

    return paper_id, title, abstract_text, authors
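
# Example of the per-paper JSON layout that get_details() expects, inferred
# from the keys accessed above. The concrete values are illustrative only,
# not taken from any real file:
#
#     {
#         "paper_id": "0001",
#         "metadata": {
#             "title": "Example title",
#             "authors": [{"first": "Ada", "last": "Lovelace"}]
#         },
#         "abstract": [{"text": "First paragraph."}, {"text": "Second paragraph."}]
#     }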

def main():
    DATA_DIR_PATH = ''  # Path of the dataset directory containing the paper JSON files

    # Output columns
    cols = ['paper_id', 'title', 'abstract', 'authors', 'doi']

    with open("papers.csv", 'w', newline='', encoding="utf8") as csv_file:
        # Write the header row, then one row per paper
        csv_writer = csv.writer(csv_file, delimiter=',')
        csv_writer.writerow(cols)
        count = 0

        # Listing of the papers to process
        with open("myfile.json", "r", encoding="utf8") as listing_file:
            data = json.load(listing_file)
        # print(data)

        for file in data:
            print(f'{count}. is being written')
            file_path = os.path.join(DATA_DIR_PATH, file['sha'])
            # print(file_path)
            with open(file_path, "r", encoding="utf8") as paper_file:
                paper_id, title, abstract, authors = get_details(paper_file)
            csv_writer.writerow([paper_id, title, abstract, authors, file['doi']])
            count += 1


# main()
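
# Sketch of the listing format assumed for "myfile.json": a JSON array whose
# entries provide the file name ('sha') and the DOI used in the loop above.
# This shape is inferred from the keys read there, not a documented format:
#
#     [
#         {"sha": "0001.json", "doi": "10.1000/example"},
#         {"sha": "0002.json", "doi": "10.1000/example2"}
#     ]
#
# To regenerate papers.csv, set DATA_DIR_PATH inside main() and uncomment the
# main() call above.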

def see_metadata():
    # Preview the first three rows of the dataset's metadata.csv
    with open('metadata.csv', 'r', encoding="utf8") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        count = 0
        for row in csv_reader:
            if count > 2:
                break
            print(row)
            print(f'Count - {count}')
            count += 1


see_metadata()