forked from gunnarleffler/get_usbr_webdata
-
Notifications
You must be signed in to change notification settings - Fork 4
/
filterA
97 lines (89 loc) · 2.64 KB
/
filterA
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#! /usr/bin/env python
helpstr='''
==============================================================================
filterA
v1.1
Gunnar Leffler
5 September 2012
This script ignores single line SHEF messages it has seen in the recent past.
The history is located in the directory the script is called from. Default
history size is 10000 lines.
usage:
filterA <filename|history size> <filename>
examples:
filterA - takes input from STDIN and returns output to STDOUT
filterA 20000 - same as above, limits dictionary to 20000 lines
filterA 20000 in.shf - same as above, reads input from in.shf
==============================================================================
'''
import sys,os
#Global variables
# Messages already seen, oldest first; each entry is one whitespace-stripped line.
shefHistory = []
# The same messages stored as dictionary keys (values are always 0) so that
# membership tests do not have to scan the list.
shefDictionary = {}
# Maximum number of lines kept in the history file; can be overridden by a
# numeric command-line argument.
historySize = 10000
# Name of the history file; relative, so it resolves against the directory
# the script is run from.
historyPath = "SHEF.history"
def readHistory(path):
    """Load the message history stored in the file at *path*.

    Returns a list with one entry per line of the file, in file order, each
    stripped of leading and trailing whitespace. A missing file is not an
    error: an empty list is returned instead.
    """
    if not os.path.exists(path):
        return []
    # The context manager closes the file even if reading raises.
    with open(path, "r") as historyFile:
        return [line.strip() for line in historyFile]
def writeHistory(path, lines, maxLines=None):
    """Overwrite the file at *path* with the most recent history entries.

    path     -- file to write; any existing content is replaced.
    lines    -- message strings, oldest first.
    maxLines -- number of trailing entries to keep. When omitted, the
                module-level historySize is used.

    Each kept entry is written on its own line. Entries older than the last
    maxLines are dropped, which keeps the history file bounded.
    """
    if maxLines is None:
        maxLines = historySize
    entries = list(lines)
    # Compute the cut-off from the entries actually passed in (not from the
    # global shefHistory) so the trim is correct for any caller.
    firstKept = max(len(entries) - maxLines, 0)
    # The context manager closes the file even if a write fails.
    with open(path, "w") as historyFile:
        historyFile.writelines(entry + "\n" for entry in entries[firstKept:])
def index(seq, f):
    """Return the position of the first element of *seq* equal to *f*.

    Returns -1 when no element matches, including when *seq* is empty.
    """
    # enumerate replaces xrange(len(seq)); xrange does not exist on Python 3.
    for position, item in enumerate(seq):
        if f == item:
            return position
    return -1
def findNewMessages(path, history=None, seen=None):
    """Return the input lines that have not been seen before.

    path    -- file to read messages from, or None to read from STDIN.
    history -- list that new messages are appended to; defaults to the
               module-level shefHistory.
    seen    -- dictionary whose keys are the messages already seen;
               defaults to the module-level shefDictionary.

    Every input line is stripped of surrounding whitespace. A line that is
    not yet a key of *seen* is recorded in both *history* and *seen* and
    included in the returned list, so a line repeated within the same input
    is reported only once. The result preserves input order.
    """
    if history is None:
        history = shefHistory
    if seen is None:
        seen = shefDictionary
    if path is None:
        rawLines = sys.stdin.readlines()
    else:
        # Close the input file as soon as it has been read.
        with open(path, "r") as inputFile:
            rawLines = inputFile.readlines()
    fresh = []
    for raw in rawLines:
        message = raw.strip()
        if message not in seen:
            history.append(message)
            seen[message] = 0
            fresh.append(message)
    return fresh
#=============================================
#This is the "entrypoint" for the script
#=============================================
shefHistory = readHistory(historyPath)
# Mirror the history as dictionary keys so membership tests are fast.
shefDictionary = dict.fromkeys(shefHistory, 0)

# Command-line forms accepted:
#   filterA                      read STDIN, default history size
#   filterA <size>               read STDIN, custom history size
#   filterA <file>               read <file>, default history size
#   filterA <size> <file>        either order of size and file is accepted
#   filterA <file> <size>
thePath = None  # None makes the reader function use STDIN
arguments = sys.argv[1:]
if len(arguments) == 1:
    if arguments[0].isdigit():
        historySize = int(arguments[0])
    else:
        thePath = arguments[0]
elif len(arguments) >= 2:
    if arguments[0].isdigit():
        historySize = int(arguments[0])
        thePath = arguments[1]
    elif arguments[1].isdigit():
        historySize = int(arguments[1])
        thePath = arguments[0]
    else:
        # Neither argument is a history size: show the usage text instead
        # of failing with an unhandled ValueError from int().
        sys.stderr.write(helpstr)
        sys.exit(1)

response = findNewMessages(thePath)
# Persist the updated history before emitting anything.
writeHistory(historyPath, shefHistory)
for message in response:
    # Single-argument call form prints identically on Python 2 and 3.
    print(message)