Uploaded scripts and raw data

crowell · Apr 17, 2012 · 52979e5 · 52979e5
1 parent e84d508
commit 52979e5
Show file tree

Hide file tree

Showing 7 changed files with 425 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,43 @@
+##################
+# Project Specific
+##################
+*.pyc
+backup/
+data/
+
+###################
+# Compiled source #
+###################
+*.com
+*.class
+*.dll
+*.exe
+*.o
+*.so
+
+############
+# Packages #
+############
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+
+# Logs and databases #
+######################
+*.log
+*.sql
+*.sqlite
+
+# OS generated files #
+######################
+.DS_Store*
+ehthumbs.db
+Icon?
+Thumbs.db
diff --git a/Makefile b/Makefile
@@ -0,0 +1,2 @@
+# clean-data is a command, not a file to build; declaring it phony keeps the
+# recipe running even if something named "clean-data" exists in the tree.
+.PHONY: clean-data
+clean-data:
+	rm -rf data/*
diff --git a/raw/country.txt b/raw/country.txt
@@ -0,0 +1,70 @@
+AD	Andorra
+AR	Argentina
+AS	American Samoa
+AT	Austria
+AU	Australia
+BD	Bangladesh
+BE	Belgium
+BG	Bulgaria
+BR	Brazil
+CA	Canada
+CH	Switzerland
+CZ	Czech Republic
+DE	Germany
+DK	Denmark
+DO	Dominican Republic
+ES	Spain
+FI	Finland
+FO	Faroe Islands
+FR	France
+GB	Great Britain
+GF	French Guyana
+GG	Guernsey
+GL	Greenland
+GP	Guadeloupe
+GT	Guatemala
+GU	Guam
+GY	Guyana
+HR	Croatia
+HU	Hungary
+IM	Isle of Man
+IN	India
+IS	Iceland
+IT	Italy
+JE	Jersey
+JP	Japan
+LI	Liechtenstein
+LK	Sri Lanka
+LT	Lithuania
+LU	Luxembourg
+MC	Monaco
+MD	Moldavia
+MH	Marshall Islands
+MK	Macedonia
+MP	Northern Mariana Islands
+MQ	Martinique
+MX	Mexico
+MY	Malaysia
+NL	Holland
+NO	Norway
+NZ	New Zealand
+PH	Philippines
+PK	Pakistan
+PL	Poland
+PM	Saint Pierre and Miquelon
+PR	Puerto Rico
+PT	Portugal
+RE	French Reunion
+RU	Russia
+SE	Sweden
+SI	Slovenia
+SJ	Svalbard & Jan Mayen Islands
+SK	Slovak Republic
+SM	San Marino
+TH	Thailand
+TR	Turkey
+US	United States
+VA	Vatican
+VI	Virgin Islands
+YT	Mayotte
+ZA	South Africa
diff --git a/raw/headers.txt b/raw/headers.txt
@@ -0,0 +1 @@
+country,post code,place name,state,state abbreviation,ignore1,ignore2,ignore3,ignore4,latitude,longitude
diff --git a/scripts/file.py b/scripts/file.py
@@ -0,0 +1,114 @@
+from __future__ import with_statement
+import codecs
+import csv
+import sys
+import json
+import os
+
+from contextlib import closing
+from zipfile import ZipFile, ZIP_DEFLATED
+import os
+
+
+'''
+ZIP Directory Helper Function
+'''
+def zipdir(basedir, archivename):
+    assert os.path.isdir(basedir)
+    with closing(ZipFile(archivename, "w", ZIP_DEFLATED)) as z:
+
+        # traverse directory recursively 
+        for root, dirs, files in os.walk(basedir):
+            #ignores empty directories
+            for fn in files:
+                absfn = os.path.join(root, fn)
+                zfn = absfn[len(basedir)+len(os.sep):] #XXX: relative path
+                z.write(absfn, zfn)
+
+'''
+Picks out all the directories and zips them
+'''
+def make_zip( countries ):
+
+    print "Zipping folders"
+
+    # for all the country codes
+    for cc in countries :
+        # make a name for the file
+        zipname = cc+".zip"
+        directory =  os.path.join(os.getcwd(),cc)
+
+        # ZIP all the folders into one
+        zipdir(directory,zipname)
+
+        # Print 10 to a line
+        count+=1
+        sys.stdout.write(cc+" ")
+        if not count%10 :
+            print ""
+
+    pass   
+
+
+'''
+Made specifically for GEONAMES.ORG postal code data parsing
+'''
+def main():
+
+    if len(sys.argv) <3:
+        print "Usage: "+sys.argv[0]+" <csv-file> <header-file>"
+        sys.exit(-1)
+
+    headerfile = sys.argv[2]
+    csvfile = sys.argv[1]
+
+    # get the headers, COMMA delimited 
+    hfile = csv.reader( open(headerfile, 'rb'), delimiter=',', quotechar='|' )
+    headers = hfile.next()
+
+    # Print list of valid header terms
+    print filter( lambda hh: "ignore" not in hh , headers )
+
+    # Read the TAB delimited file 
+    reader =csv.reader(open(csvfile, 'rb'), delimiter='\t', quotechar='|')
+
+    # Keep track of country changes
+    countries = set()
+
+    print "Generating Files ... "
+
+    for row in reader :
+
+        # If not empty
+        if row[0] != '':
+            cc = row[0].lower()
+            output_dir = os.path.join(os.getcwd(), cc)
+            if not os.path.exists(output_dir): os.makedirs(output_dir);
+
+            # Print if we have moved onto a new country country
+            if cc not in countries:
+                countries.add(cc)
+                sys.stdout.write(cc+ " ")
+                if not len(countries)%10 :
+                    print "" 
+
+            postcode = row[1]
+            index = dict();
+
+            # Populate information 
+            for ii in range(0,len(headers)):
+                if 'ignore' not in headers[ii]:
+                    index[ headers[ii] ] = unicode(row[ii], 'utf-8');
+
+            raw = json.dumps(index,ensure_ascii=False);
+            fout = codecs.open(os.path.join(output_dir,postcode), encoding='utf-8', mode="w+" )
+            fout.write( raw )
+            fout.close()
+
+    make_zip(countries)
+
+
+
+
+main()
+
diff --git a/scripts/france.py b/scripts/france.py
@@ -0,0 +1,66 @@
+from __future__ import with_statement
+import codecs
+import csv
+import sys
+import json
+import os
+
+import os
+
+
+
+
+'''
+Made specifically for GEONAMES.ORG postal code data parsing
+'''
+def main():
+
+    if len(sys.argv) <3:
+        print "Usage: "+sys.argv[0]+" <csv-file> <header-file>"
+        sys.exit(-1)
+
+    headerfile = sys.argv[2]
+    csvfile = sys.argv[1]
+
+    # get the headers, COMMA delimited 
+    hfile = csv.reader( open(headerfile, 'rb'), delimiter=',', quotechar='|' )
+    headers = hfile.next()
+
+    # Print list of valid header terms
+    print filter( lambda hh: "ignore" not in hh , headers )
+
+    # Read the TAB delimited file 
+    reader =csv.reader(open(csvfile, 'rb'), delimiter='\t', quotechar='|')
+
+    # Keep track of country changes
+
+    print "Generating Files for France "
+
+    for row in reader :
+
+        # If only france
+        if row[0].lower() == 'fr':
+            cc = row[0].lower()
+            output_dir = os.path.join(os.getcwd(), cc)
+            if not os.path.exists(output_dir): os.makedirs(output_dir);
+
+
+            postcode = row[1].split(' ')[0]
+            index = dict();
+
+            # Populate information 
+            for ii in range(0,len(headers)):
+                if 'ignore' not in headers[ii]:
+                    index[ headers[ii] ] = unicode(row[ii], 'utf-8');
+
+            raw = json.dumps(index,ensure_ascii=False);
+            fout = codecs.open(os.path.join(output_dir,postcode), encoding='utf-8', mode="w+" )
+            fout.write( raw )
+            fout.close()
+
+
+
+
+
+main()
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		country,post code,place name,state,state abbreviation,ignore1,ignore2,ignore3,ignore4,latitude,longitude