-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfblogger.py
81 lines (69 loc) · 3.07 KB
/
fblogger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
import facebook
import re
import pymongo
from pymongo import Connection
import os
import unicodedata
import urllib
import lxml.html
from xml.dom import minidom
# Access token handed to facebook.GraphAPI() in fetch().
# NOTE(review): "your token" looks like a placeholder - supply a real token before running.
token = "your token"
def _first_url(message):
    """Return the first http(s) URL found in *message*, or None if there is none."""
    found = re.search(r"(?P<url>https?://[^\s]+)", message)
    if found is None:
        return None
    return found.group("url")


def _youtube_video_id(url):
    """Return the video ID embedded in a YouTube *url*, or None if the URL does not match."""
    found = re.search(r"(youtube|youtu)\.(com|be)/(.*v=([^&]*)|([^&]*))", url)
    if found is None:
        return None
    # Hosts matched as "youtube" carry the ID in the "v=" parameter (group 4);
    # otherwise the text that follows the host (group 5) is used.
    if found.group(1) == "youtube":
        return found.group(4)
    return found.group(5)


def _node_text(nodes):
    """Return the text of the first element in *nodes*, or None if absent or empty."""
    if not nodes:
        return None
    first_child = nodes[0].firstChild
    if first_child is None:
        return None
    return first_child.nodeValue


def _youtube_details(video_id):
    """Fetch metadata for *video_id* from the GData video feed.

    Returns a dict holding whichever of 'title', 'category', 'description'
    and 'thumbnail' the feed provides, or None when the feed has a
    media:title element without any text (the record is then skipped).
    """
    # NOTE(review): Google has retired the GData v2 API - confirm this endpoint
    # still answers before relying on it.
    feed = urllib.urlopen('http://gdata.youtube.com/feeds/api/videos/' + video_id + '?v=2')
    try:
        xmldoc = minidom.parse(feed)
    finally:
        # Release the HTTP response even when the XML cannot be parsed.
        feed.close()

    details = {}

    title_nodes = xmldoc.getElementsByTagName('media:title')
    if title_nodes:
        title = _node_text(title_nodes)
        if title is None:
            return None
        details['title'] = title

    category_nodes = xmldoc.getElementsByTagName('media:category')
    if category_nodes:
        details['category'] = category_nodes[0].getAttribute('label')

    # An empty description element must not discard the whole record.
    description = _node_text(xmldoc.getElementsByTagName('media:description'))
    if description is not None:
        details['description'] = description

    # The thumbnail is optional; a feed without one is still archived.
    thumbnail_nodes = xmldoc.getElementsByTagName('media:thumbnail')
    if thumbnail_nodes:
        details['thumbnail'] = str(thumbnail_nodes[0].getAttribute('url'))

    return details


def extract(comments):
    """Archive every comment that contains a link into the 'archive' collection.

    For each comment whose message contains an http(s) URL, a document with
    the commenter's name, the URL and the comment id (stored as 'urlID') is
    built.  YouTube links are enriched with metadata from the video feed;
    any other link gets the <title> of the linked page.  Comments whose id
    is already stored are skipped.  A failure while processing one comment
    is reported on stdout and does not stop the remaining comments.

    comments: iterable of dicts with "message", "id" and "from" -> "name"
        keys.  Comments without message text are ignored, and the dicts are
        not modified.  An empty or None value returns immediately without
        opening a database connection.
    """
    if not comments:
        return

    # NOTE(review): the connection URI (credentials included) is hard-coded, and
    # the "[email protected]" fragment looks like a redaction placeholder rather
    # than a real host - confirm the real URI and keep it out of source control.
    con = Connection('mongodb://pnhegde:[email protected]:10017/music')
    collection = con['music']['archive']

    for comment in comments:
        message = comment.get("message")
        if not message:
            continue
        if isinstance(message, unicode):
            # Fold to plain ASCII on a local copy instead of rewriting the caller's dict.
            message = unicodedata.normalize('NFKD', message).encode('ascii', 'ignore')

        url = _first_url(message)
        if url is None:
            continue

        entry = {
            "name": comment["from"]["name"],
            "url": url,
            "urlID": comment["id"],
        }

        try:
            if collection.find({'urlID': str(comment["id"])}).count() != 0:
                print("already exist")
                continue

            if "youtu" in url:
                video_id = _youtube_video_id(url)
                if not video_id:
                    print("Error finding video ID")
                    continue
                print(video_id)
                details = _youtube_details(video_id)
                if details is None:
                    continue
                entry.update(details)
            else:
                title_node = lxml.html.parse(url).find(".//title")
                if title_node is None:
                    print("no <title> found for %s" % url)
                    continue
                entry["title"] = title_node.text

            collection.insert(entry, safe=True)
        except Exception as e:
            # Best effort: one bad comment must not stop the run, but say why
            # it was skipped instead of failing silently.
            print("skipping comment %s: %r" % (comment["id"], e))
            continue
def fetch():
    """Download the comments of the configured Graph object and pass them to extract().

    Requests up to 1000 comments in a single call using the module-level
    access token, then forwards the "data" list of the response.
    """
    api = facebook.GraphAPI(token)
    response = api.get_object("397729806942347/comments", limit=1000)
    extract(response["data"])
# Run the fetch-and-archive job only when executed as a script, not on import.
if __name__ == '__main__':
    fetch()