Skip to content

Commit

Permalink
Uploaded scripts and raw data
Browse files Browse the repository at this point in the history
  • Loading branch information
samirahmed committed Apr 17, 2012
1 parent e84d508 commit 52979e5
Show file tree
Hide file tree
Showing 7 changed files with 425 additions and 0 deletions.
43 changes: 43 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
##################
# Project Specific
##################
*.pyc
backup/
data/

###################
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so

############
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip

# Logs and databases #
######################
*.log
*.sql
*.sqlite

# OS generated files #
######################
.DS_Store*
ehthumbs.db
Icon?
Thumbs.db
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# clean-data names an action, not a file: declare it phony so that a file
# called "clean-data" in this directory can never make the rule look up to date.
.PHONY: clean-data

# Delete everything matched by data/*; the data/ directory itself is kept.
clean-data:
	rm -rf data/*
70 changes: 70 additions & 0 deletions raw/country.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
AD Andorra
AR Argentina
AS American Samoa
AT Austria
AU Australia
BD Bangladesh
BE Belgium
BG Bulgaria
BR Brazil
CA Canada
CH Switzerland
CZ Czech Republic
DE Germany
DK Denmark
DO Dominican Republic
ES Spain
FI Finland
FO Faroe Islands
FR France
GB Great Britain
GF French Guyana
GG Guernsey
GL Greenland
GP Guadeloupe
GT Guatemala
GU Guam
GY Guyana
HR Croatia
HU Hungary
IM Isle of Man
IN India
IS Iceland
IT Italy
JE Jersey
JP Japan
LI Liechtenstein
LK Sri Lanka
LT Lithuania
LU Luxembourg
MC Monaco
MD Moldavia
MH Marshall Islands
MK Macedonia
MP Northern Mariana Islands
MQ Martinique
MX Mexico
MY Malaysia
NL Holland
NO Norway
NZ New Zealand
PH Philippines
PK Pakistan
PL Poland
PM Saint Pierre and Miquelon
PR Puerto Rico
PT Portugal
RE French Reunion
RU Russia
SE Sweden
SI Slovenia
SJ Svalbard & Jan Mayen Islands
SK Slovak Republic
SM San Marino
TH Thailand
TR Turkey
US United States
VA Vatican
VI Virgin Islands
YT Mayotte
ZA South Africa
1 change: 1 addition & 0 deletions raw/headers.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
country,post code,place name,state,state abbreviation,ignore1,ignore2,ignore3,ignore4,latitude,longitude
114 changes: 114 additions & 0 deletions scripts/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from __future__ import with_statement
import codecs
import csv
import sys
import json
import os

from contextlib import closing
from zipfile import ZipFile, ZIP_DEFLATED
import os


'''
ZIP Directory Helper Function
'''
def zipdir(basedir, archivename):
    """Zip every file found under *basedir* into the archive *archivename*.

    Each entry is stored under its path relative to *basedir*. Only files
    are written, so directories that contain no files get no entry.

    Raises AssertionError if *basedir* is not an existing directory.
    """
    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; the exception type seen by callers is unchanged.
    if not os.path.isdir(basedir):
        raise AssertionError("not a directory: %r" % (basedir,))

    with closing(ZipFile(archivename, "w", ZIP_DEFLATED)) as z:
        # traverse directory recursively
        for root, dirs, files in os.walk(basedir):
            for fn in files:
                absfn = os.path.join(root, fn)
                # relpath stays correct when basedir ends with a separator;
                # slicing off len(basedir) + len(os.sep) characters would
                # then also eat the first character of the entry name.
                zfn = os.path.relpath(absfn, basedir)
                z.write(absfn, zfn)

'''
Picks out all the directories and zips them
'''
def make_zip(countries):
    """Zip each country's directory into ``<cc>.zip``.

    *countries* is an iterable of country-code strings. For every code
    ``cc`` the directory ``<cwd>/<cc>`` is archived with zipdir() into the
    relative path ``<cc>.zip``. Progress is echoed to stdout, ten codes
    per line.
    """
    print("Zipping folders")

    # enumerate() provides the 1-based progress counter; it was previously
    # incremented without being initialised, which raised UnboundLocalError
    # on the first country.
    for count, cc in enumerate(countries, 1):
        # make a name for the file
        zipname = cc + ".zip"
        directory = os.path.join(os.getcwd(), cc)

        # ZIP all the folders into one
        zipdir(directory, zipname)

        # Print 10 to a line
        sys.stdout.write(cc + " ")
        if not count % 10:
            print("")


'''
Made specifically for GEONAMES.ORG postal code data parsing
'''
def main():
    """Split a postal-code dump into one JSON file per post code, then zip.

    Command line: ``<script> <csv-file> <header-file>``; exits with status
    -1 when fewer than two arguments are given.

    The header file is a single comma-delimited line naming the columns of
    the tab-delimited data file. Columns whose name contains "ignore" are
    dropped. Every row that has a country code and a post code is written
    as a JSON object to ``<cwd>/<lower-cased country code>/<post code>``;
    a later row with the same country and post code overwrites the earlier
    file. Finally every country directory is zipped through make_zip().
    """
    if len(sys.argv) < 3:
        print("Usage: " + sys.argv[0] + " <csv-file> <header-file>")
        sys.exit(-1)

    headerfile = sys.argv[2]
    csvfile = sys.argv[1]

    # get the headers, COMMA delimited
    with _open_csv(headerfile) as header_fh:
        headers = next(csv.reader(header_fh, delimiter=',', quotechar='|'))

    # (column index, name) for every column that is not marked "ignore"
    wanted = [(ii, name) for ii, name in enumerate(headers)
              if 'ignore' not in name]

    # Print list of valid header terms
    print([name for ii, name in wanted])

    # Keep track of country changes
    countries = set()

    print("Generating Files ... ")

    with _open_csv(csvfile) as data_fh:
        # Read the TAB delimited file
        for row in csv.reader(data_fh, delimiter='\t', quotechar='|'):
            # Blank lines parse to [] and a row without a country code or
            # post code cannot be turned into an output path: skip those
            # instead of crashing halfway through the dump.
            if len(row) < 2 or row[0] == '' or row[1] == '':
                continue

            cc = row[0].lower()
            output_dir = os.path.join(os.getcwd(), cc)

            # Print if we have moved onto a new country
            if cc not in countries:
                countries.add(cc)
                # The directory only needs checking once per country.
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                sys.stdout.write(cc + " ")
                if not len(countries) % 10:
                    print("")

            postcode = row[1]

            # Populate information; columns missing from a short row are
            # stored as empty strings so every record has the same keys.
            index = {}
            for ii, name in wanted:
                value = row[ii] if ii < len(row) else ''
                index[name] = _to_text(value)

            raw = json.dumps(index, ensure_ascii=False)
            with codecs.open(os.path.join(output_dir, postcode),
                             encoding='utf-8', mode="w+") as fout:
                fout.write(raw)

    make_zip(countries)


def _open_csv(path):
    """Open *path* in the mode the running interpreter's csv module needs."""
    if sys.version_info[0] < 3:
        # Python 2's csv module works on byte strings.
        return open(path, 'rb')
    return open(path, 'r', newline='', encoding='utf-8')


def _to_text(value):
    """Return *value* as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value




main()

66 changes: 66 additions & 0 deletions scripts/france.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from __future__ import with_statement
import codecs
import csv
import sys
import json
import os

import os




'''
Made specifically for GEONAMES.ORG postal code data parsing
'''
def main():
    """Write one JSON file per French post code from a postal-code dump.

    Command line: ``<script> <csv-file> <header-file>``; exits with status
    -1 when fewer than two arguments are given.

    The header file is a single comma-delimited line naming the columns of
    the tab-delimited data file. Columns whose name contains "ignore" are
    dropped. Only rows whose first column is "fr" (case-insensitive) are
    used. The output file name is the post code up to its first space, so
    a value such as "75001 CEDEX" is written to ``<cwd>/fr/75001`` while
    the JSON keeps the full original value. A later row that maps to the
    same file name overwrites the earlier file.
    """
    if len(sys.argv) < 3:
        print("Usage: " + sys.argv[0] + " <csv-file> <header-file>")
        sys.exit(-1)

    headerfile = sys.argv[2]
    csvfile = sys.argv[1]

    # get the headers, COMMA delimited
    with _open_csv(headerfile) as header_fh:
        headers = next(csv.reader(header_fh, delimiter=',', quotechar='|'))

    # (column index, name) for every column that is not marked "ignore"
    wanted = [(ii, name) for ii, name in enumerate(headers)
              if 'ignore' not in name]

    # Print list of valid header terms
    print([name for ii, name in wanted])

    print("Generating Files for France ")

    output_dir = os.path.join(os.getcwd(), 'fr')

    with _open_csv(csvfile) as data_fh:
        # Read the TAB delimited file
        for row in csv.reader(data_fh, delimiter='\t', quotechar='|'):
            # Blank lines parse to []; only France is of interest here.
            if not row or row[0].lower() != 'fr':
                continue
            # Without a post code there is no file name to write to.
            if len(row) < 2:
                continue
            postcode = row[1].split(' ')[0]
            if not postcode:
                continue

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            # Populate information; columns missing from a short row are
            # stored as empty strings so every record has the same keys.
            index = {}
            for ii, name in wanted:
                value = row[ii] if ii < len(row) else ''
                index[name] = _to_text(value)

            raw = json.dumps(index, ensure_ascii=False)
            with codecs.open(os.path.join(output_dir, postcode),
                             encoding='utf-8', mode="w+") as fout:
                fout.write(raw)


def _open_csv(path):
    """Open *path* in the mode the running interpreter's csv module needs."""
    if sys.version_info[0] < 3:
        # Python 2's csv module works on byte strings.
        return open(path, 'rb')
    return open(path, 'r', newline='', encoding='utf-8')


def _to_text(value):
    """Return *value* as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value





main()

Loading

0 comments on commit 52979e5

Please sign in to comment.