scrape_medium_stats.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""The goal is to scrape baseline stats on stories from https://medium.com/me/stats
in order to get a better understanding of how readers engage with a writers work.
At the moment, this script will only be functional if you have set up your
Medium login through Google, though Facebook, Twitter, etc. would be similar
requiring only a few tweaks on your part. The output will be produced in your
curent directory as file called mystats.csv. It's also worth noting that this is
a personal project and is in no way associated with Medium."""
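# Note: the locator methods below (find_element_by_*) are the Selenium 3.x
# API, which was removed in Selenium 4; pin an older selenium release or
# switch to driver.find_element(By..., ...). Google accounts with two-step
# verification enabled will likely block this automated login.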
# Imports
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
# Insert Google login info for Medium entry
USER = ''
PASS = ''
# Goes through splash and login process
def splash_process(driver, email, password):
    # Go to the sign-in page
    driver.get('https://medium.com/m/signin')
    # Click the sign-in button
    driver.find_element_by_xpath(".//button[contains(.,'Sign in')]").click()
    # Click 'Sign in with Google'
    driver.find_element_by_xpath(".//button[contains(.,'Sign in with Google')]").click()
    # Find the email field
    email_field = driver.find_element_by_id("identifierId")
    # Type in the email
    email_field.send_keys(email)
    # Click the next button
    driver.find_element_by_id("identifierNext").click()
    # Wait a sec
    time.sleep(1)
    # Find the password field
    pass_field = driver.find_element_by_name("password")
    # Type in the password
    pass_field.send_keys(password)
    # Click the next button
    driver.find_element_by_id("passwordNext").click()
    # Wait a few seconds
    time.sleep(3)
    # Go to the stats page
    driver.get('https://medium.com/me/stats')
# Scrolls to the bottom to get all posts into view
def scroll(driver):
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the page to load
        time.sleep(6)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
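# Note: the fixed 6-second pause per scroll above is a heuristic; slow
# connections or very long stats pages may need a larger value.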
def get_info(driver):
    # Grab the main table html from Medium stats
    table = driver.find_element_by_class_name('js-statsTableBody')
    # Get the raw html from our table element
    raw_html = table.get_attribute('innerHTML')
    # Quit our driver
    driver.quit()
    # Clean html
    soup = BeautifulSoup(raw_html, 'html.parser')
    # Story titles
    titles = [item.text for item in soup.select('h2')]
    #print('---------------------')
    #print('Title:', titles[0])
    # Reading times
    read_times = [item.get('title') for item in soup.findAll('span',
                  {'class': 'readingTime'})]
    #print('Read Time:', read_times[0])
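    # Each story's publication (if any) appears in the sibling elements between
    # consecutive <h2> titles, rendered as 'in <Publication>' followed by a
    # 'View story' link; the [3:] slice below strips the leading 'in '.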
    # Publication names
    pubs = []
    h2tags = soup.find_all('h2')
    for h2tag in h2tags:
        elem = h2tag.next_sibling
        while elem and elem.name != 'h2':
            if elem.text.split('View story')[0] == '':
                pubs.append('None')
            else:
                pubs.append(elem.text.split('View story')[0][3:])
            elem = elem.next_sibling
    #print('Publication:', pubs[0])
    # Get all numerical metrics
    nums = [item.text for item in soup.findAll('span',
            {'class': 'sortableTable-value'}) if (len(item.text) < 13 or '.' in item.text)]
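    # The filter above drops 13-digit values (likely millisecond timestamps in
    # the table); what remains comes in groups of four per story, so the
    # stride-4 slices below pull out each metric in column order.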
    # Views
    views = nums[::4]
    #print('Views:', views[0])
    # Reads
    reads = nums[1::4]
    #print('Reads:', reads[0])
    # Read ratio
    ratio = nums[2::4]
    #print('Read Ratio:', ratio[0])
    # Fans
    fans = nums[3::4]
    #print('Fans:', fans[0])
    # Create dataframe
    df = pd.DataFrame(data={'Title': titles, 'Read Time': read_times, 'Publication': pubs,
                            'Views': views, 'Reads': reads, 'Read Ratio': ratio, 'Fans': fans})
    # Reorder columns
    df = df[['Title', 'Publication', 'Read Time', 'Views', 'Reads', 'Read Ratio', 'Fans']]
    # Convert numerical features to numeric types
    df = df.apply(pd.to_numeric, errors='ignore')
    # 'Read Time' comes in as e.g. '4 min read'; keep just the number of minutes
    df['Read Time'] = df['Read Time'].apply(lambda x: int(x.split()[0]))
    #print('---------------------')
    # Return the DataFrame
    return df
# Print results in the terminal
def print_results(df):
    for index, row in df.iterrows():
        if index == 0:
            print('---------------------')
        print('Title:', row['Title'])
        print('Read Time:', row['Read Time'])
        print('Publication:', row['Publication'])
        print('Views:', row['Views'])
        print('Reads:', row['Reads'])
        print('Read Ratio:', row['Read Ratio'])
        print('Fans:', row['Fans'])
        print('---------------------')
        time.sleep(.2)
# Run main
if __name__ == "__main__":
    # Start the driver
    driver = webdriver.Chrome('../chromedriver')
    # Log in!
    splash_process(driver, USER, PASS)
    scroll(driver)
    # Scrape the stats table into a DataFrame
    df = get_info(driver)
    print_results(df)
    # Export as csv
    df.to_csv('mystats.csv', index=False)
    print('Created mystats.csv')
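# Usage: fill in USER and PASS above, place a ChromeDriver binary one
# directory up (../chromedriver), then run:
#   python scrape_medium_stats.py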