-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbot.py
158 lines (131 loc) · 5.41 KB
/
bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import requests
from requests import get
from datetime import date
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import numpy as np
import tweepy
import random
import time
import config
# Credentials live in config.py locally; on the private live (Heroku) version
# of this repo the raw tokens are pasted straight into this file instead.
# NOTE: the originals referenced bare names (consumer_key, ...) which are
# undefined here — they must come from the imported config module.
auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token, config.access_token_secret)
api = tweepy.API(auth)
# Work out what day it is, and hence whether to scrape at all
# (the Lords don't sit at the weekend — see the guard at the bottom).
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
today = date.today().isoweekday() - 1  # 0 = Monday ... 6 = Sunday
day = days[today]
headers = {"Accept-language": "en-US, en;q=0.5"}
url = "https://hansard.parliament.uk/lords/"
# date stamp (YYYY-MM-DD) used to build the dynamic Hansard URLs
time_stamp = datetime.datetime.now()
date_stamp = time_stamp.strftime("%Y-%m-%d")
def scrape():
# creates dynamic url to get the current day's list
results = requests.get(url + date_stamp, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
# this gets rid of the span texts (item number n) etc
for span_tag in soup.findAll('span'):
span_tag.replace_with('')
# the issues_div is subject to change without any warning
# need a way of knowing when this happens, rather than just noticing the code not running
issues_div = soup.find_all('div', class_='primary-info')
issues_list = []
# this gets rid of the whitespace in the html text
for container in issues_div:
issue = container.text
issues_list.append(issue.strip())
# make the items in the list unique
issues_set = set(issues_list)
# convert back to list
issues = list(issues_set)
# create the big string
string_dirty = ', '.join(sorted(issues))
# abbreviate or shorten the repetitive list items - to be added to
# really trying to keep the final string as short as possible so it doesn't create so many tweets
# this isn't working btw
string = string_dirty.replace('Arrangement of Business', '')
string = string.replace('Regulations 2020', 'Regs 2020')
string = string.replace('Business of the House', 'BoH')
string = string.replace('House of Lords', 'HoL')
string = string.replace('Lord Speaker’s Statement', 'LSS')
string = string.replace('Whole day', '|')
string = string[2:]
intro = day + ", the Lords discussed: "
cont = "Cont'd: "
nada = "Nothing to report so far today."
# this creates a clickable link to the pdf version - dynamically created url
# pdf_link = "https://hansard.parliament.uk/pdf/lords/" + date_stamp
# pdf_tweet = "You can download the Hansard record of the entire day in PDF format here: " + pdf_link
if len(string) < 1:
print(nada)
else:
if len(string) < 242:
api.update_status(intro + string)
# api.update_status(pdf_tweet)
# i cannot maths. this splits the string into two tweets which will be under 280 characters with intro/cont
elif len(string) < 500:
first, second = string[:len(string)//2], string[len(string)//2:]
api.update_status(intro + first)
api.update_status(cont + second)
# api.update_status(pdf_tweet)
# this can all go into a function if it works reliably in January
elif len(string) < 774:
first = string[:242]
second = string[242:508]
third = string[508:774]
api.update_status(intro + first + " (1/3)")
api.update_status(cont + second + " (2/3)")
api.update_status(cont + third + " (3/3)")
# api.update_status(pdf_tweet)
elif len(string) < 1040:
first = string[:242]
second = string[242:508]
third = string[508:774]
fourth = string[774:1040]
api.update_status(intro + first + " (1/4)")
api.update_status(cont + second + " (2/4)")
api.update_status(cont + third + " (3/4)")
api.update_status(cont + fourth + " (4/4)")
# api.update_status(pdf_tweet)
elif len(string) < 1306:
first = string[:242]
second = string[242:508]
third = string[508:774]
fourth = string[774:1040]
fifth = string[1040:1306]
# sometimes they may not be anything to go into the fifth tweet...
# need to make the (1/5) dynamic and smart rather than static
api.update_status(intro + first + " (1/5)")
api.update_status(cont + second + " (2/5)")
api.update_status(cont + third + " (3/5)")
api.update_status(cont + fourth + " (4/5)")
if len(fifth) > 0:
api.update_status(cont + fifth + " (5/5)")
# api.update_status(pdf_tweet)
# if the list of issues has more than 1572 characters this whole program will not work...
# i really don't want to be littering people's timelines with more than 6 tweets though
elif len(string) < 1572:
first = string[:242]
second = string[242:508]
third = string[508:774]
fourth = string[774:1040]
fifth = string[1040:1306]
sixth = string[1306:1572]
api.update_status(intro + first + " (1/6)")
api.update_status(cont + second + " (2/6)")
api.update_status(cont + third + " (3/6)")
api.update_status(cont + fourth + " (4/6)")
api.update_status(cont + fifth + " (5/6)")
if len(sixth) > 0:
api.update_status(cont + sixth + " (6/6)")
# api.update_status(pdf_tweet)
else:
api.update_status(nada)
# The Lords sit Monday-Friday, so on the weekend (today is 0-indexed,
# 5 = Saturday, 6 = Sunday) just tweet that there is nothing to scrape.
if today >= 5:
    api.update_status("The Lords do not debate on " + day + "s.")
else:
    scrape()