migrated from JPL github

torresal · Aug 28, 2015 · 06e976a · 06e976a
1 parent d545a28
commit 06e976a
Showing 26 changed files with 296 additions and 3,554 deletions.
diff --git a/Amazon parse html copy.py → Amazon parse html.py b/Amazon parse html copy.py → Amazon parse html.py
diff --git a/DBPEDIA-ERRORS copy.txt → DBPEDIA-ERRORS.txt b/DBPEDIA-ERRORS copy.txt → DBPEDIA-ERRORS.txt
diff --git a/GOOGLE-DATA copy.txt → GOOGLE-DATA.txt b/GOOGLE-DATA copy.txt → GOOGLE-DATA.txt
diff --git a/GOOGLE-ERRORS copy.txt → GOOGLE-ERRORS.txt b/GOOGLE-ERRORS copy.txt → GOOGLE-ERRORS.txt
diff --git a/Gcis.conf b/Gcis.conf
@@ -0,0 +1,4 @@
+---
+- url      : https://gcis-search-stage.jpl.net:3000
+  userinfo : user:key
+  key      : key
diff --git a/Gcis.conf copy b/Gcis.conf copy
diff --git a/ISBNDB-DATA copy.txt → ISBNDB-DATA.txt b/ISBNDB-DATA copy.txt → ISBNDB-DATA.txt
diff --git a/ISBNDB-ERRORS copy.txt → ISBNDB-ERRORS.txt b/ISBNDB-ERRORS copy.txt → ISBNDB-ERRORS.txt
diff --git a/LOC-DATA copy.txt → LOC-DATA.txt b/LOC-DATA copy.txt → LOC-DATA.txt
diff --git a/LOC-ERRORS copy.txt → LOC-ERRORS.txt b/LOC-ERRORS copy.txt → LOC-ERRORS.txt
diff --git a/README.md b/README.md
@@ -1 +1,79 @@
-# gcis-isbn-validation
+# gcis-isbn-validator
+Scripts used to validate and clean up ISBN formats in GCIS-DEV.
+
+What does this script do?
+=====================
+*"isbn-validator-post"* is used **ONLY** to extract, clean, and repost ISBNs onto GCIS-DEV.
+
+Details:
+
+    1. Accepts Gcis.conf credentials as command line argument [example: https://metacpan.org/pod/Gcis::Client#CONFIGURATION]
+    2. Parses metadata from GCIS-DEV into a JSON dictionary 
+    3. Extracts the "isbn" key from the dictionary and updates the ISBN to a canonical 13-character ISBN format
+    4. Posts updated JSON dictionary back into GCIS-DEV using credentials from command line arguments.
+
+Other scripts are used to compare metadata. If the metadata from GCIS-DEV matches the metadata from another database, then the cleaned ISBN is considered valid. Each script is dedicated to a specific database (World Cat, Google, Library of Congress, etc.).
+
+Each of the following scripts:
+
+    1. *Extracts all ISBN formats from GCIS-DEV 
+    2. Converts ISBN to a validated ISBN-13 format 
+    3. Writes metadata from GCIS-DEV and other Database onto text file [example: "{database}-DATA.txt"].
+    4. Writes errors from GCIS-DEV and other Database onto text file [example: "{database}-ERRORS.txt"].
+
+Requirements
+============
+1. Python:
+
+  - Python 3.4 
+
+2. API access: (isbn-validator-post use only)
+
+    Positional arg:
+
+      Gcis.conf (YAML format) file containing:
+
+        - url
+        - userinfo
+        - key
+
+    Optional arg: (*use only if Gcis.conf file is not available)
+
+        -username and api key
+
+Installation
+============
+Clone git repo "gcis-isbn-validator".
+
+
+Usage
+=====
+To execute script, open command line and change directory to location of git repo. 
+
+Enter into command line:
+
+    ~python [SCRIPT.py]
+
+*Some scripts may require extra arguments. To view them, add the "--help" flag after the script name:
+
+    ~python [SCRIPT.py] --help
+
+Notes/References
+================
+*Scripts that require command line arguments:
+
+        - isbn-validator-post.py
+        - isbndb_isbn_validator.py
+        
+Incomplete Scripts:
+
+        - amazon_isbn_validator.py
+        
+                Issue: Implementing Amazon Request into script and parsing the Amazon metadata
+                        
+        - dbpedia_isbn_validator.py 
+        
+                Issue: No ISBN results when running ISBN SPARQL query
+        
+
+
diff --git a/README.md copy b/README.md copy
diff --git a/WCAT-DATA copy.txt b/WCAT-DATA copy.txt
diff --git a/WCAT-DATA.txt b/WCAT-DATA.txt
@@ -0,0 +1,3 @@
+DATE:08/14/2015
+MILITARY TIME:10:05:21
+
diff --git a/WCAT-ERRORS copy.txt b/WCAT-ERRORS copy.txt
diff --git a/WCAT-ERRORS.txt b/WCAT-ERRORS.txt
@@ -0,0 +1,2 @@
+DATE:08/14/2015
+MILITARY TIME:10:05:21
diff --git a/amazon_isbn_validator copy.py → amazon_isbn_validator.py b/amazon_isbn_validator copy.py → amazon_isbn_validator.py
@@ -41,7 +41,9 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('-awsid', '--AWSAccessKeyID', help = "Insert AWS Access Key ID")
     parser.add_argument('-astag', '--AssociateTag', help = "Insert Amazon Associate Tag")
+    parser.add_argument('-path', '--GCIS', help = "Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
     args = parser.parse_args()
+    GCIS = args.GCIS
 
     if args.AWSAccessKeyID:
         print(args.AWSAccessKeyID)
@@ -53,6 +55,12 @@ def main():
     else:
         print('NO Amazon Associate Tag')
 
+    if GCIS is None:
+        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
+        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
+
+    GCISPAR = parse(GCIS)
+
 
     for x in range(len(GCISPAR)):
         try:
@@ -89,4 +97,7 @@ def main():
         except:
             Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
             print(Error)
-            file.write(Error)
+            file.write(Error)
+
+if __name__ =='__main__':
+    main()
diff --git a/dbpedia_isbn_validator copy.py b/dbpedia_isbn_validator copy.py
diff --git a/dbpedia_isbn_validator.py b/dbpedia_isbn_validator.py
@@ -0,0 +1,121 @@
+__author__ = 'torresal'
+
+"""Notes for Programmer"""
+"""Helpful links:
+        http://dbpedia.org/sparql
+        http://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+distinct+%3Fbook+%3Fisbn%0D%0A++++where+%7B%0D%0A++++++%3Fbook+a+dbo%3ABook+.%0D%0A++++++%3Fbook+%3Fprop+%3Fobj+.%0D%0A++++++%3Fbook+dbp%3Aisbn+%3Fisbn+.%0D%0A++++%7D%0D%0A++++LIMIT+1000&format=text%2Fhtml&timeout=30000&debug=on
+        https://pypi.python.org/pypi/Distance"""
+
+import re,argparse, time
+from isbn_hyphenate import hyphenate
+from isbnlib import EAN13, clean, to_isbn13, meta, canonical, to_isbn10
+from SPARQLWrapper import SPARQLWrapper, JSON
+
+def RQUERY(r):
+    #SPARQL query ISBNs from dbpedia
+    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
+    sparql.setQuery(r)
+    sparql.setReturnFormat(JSON)
+    results = sparql.query().convert()
+    print(r)
+    #print(results)
+    if len(results["results"]["bindings"]) != 0:
+        print(results)
+        pass
+    return results["results"]["bindings"]
+
+# SPARQL template for DBpedia books. The single %s placeholder is filled in
+# with one ISBN spelling (via `QUERY % ...`) and is used as a regex filter on
+# the book's dbp:isbn value; at most 100 rows are requested.
+QUERY = """
+                select distinct ?book ?prop ?obj
+            where {
+              ?book a dbo:Book .
+              ?book ?prop ?obj .
+              ?book dbp:isbn ?isbn .
+              FILTER (regex(?isbn, "%s" ))
+            }
+            LIMIT 100
+        """
+
+# Error log. Opened in write mode when the module is loaded, so an existing
+# DBPEDIA-ERRORS.txt is overwritten; the log starts with a date line and a
+# 24-hour time line.
+# NOTE(review): the handle is not closed anywhere in this script.
+file = open("DBPEDIA-ERRORS.txt", "w")
+DATE = ("DATE:" + time.strftime("%m/%d/%Y"))
+TIME = ("MILITARY TIME:" + time.strftime("%H:%M:%S"))
+file.write(DATE+"\n"+TIME+"\n")
+
+
+def parse(url):
+    import requests
+    r = requests.get(url, verify = False)
+    JSONdict = r.json()
+    return JSONdict
+
+def main():
+#Commnd line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-path', '--GCIS', help = "Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
+    args = parser.parse_args()
+    GCIS = args.GCIS
+
+    if GCIS is None:
+        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
+        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
+
+    GCISPAR = parse(GCIS)
+    for x in range(len(GCISPAR)):
+        try:
+        #Extracts book identifier from GCIS#
+            IDEN = GCISPAR[x]["identifier"]
+            match =  re.search(r'.*/(.*?)\..*?$', GCIS)
+            if match:
+                FILETYPE = match.groups()[0]
+        #HREF = url that leads to book.json in GCIS-DEV
+            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE,IDEN)
+            HREFPAR = parse(HREF)
+        #Extracts book title and isbn from GCIS-DEV
+            d = dict(HREFPAR)
+            TITLE = d['title']
+            ISBNS = d['isbn']
+        #Cleans ISBNS to only conatian valid characters
+            CISBN = clean(ISBNS)
+        #V13 = validated canonical ISBN-13
+            V13 = EAN13(CISBN)
+            if V13 is None:
+                V13 = canonical(CISBN)
+            M = parse(HREF)
+
+            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")
+
+        #DBpedia ISBN formats
+            a = ISBNS
+            b = canonical(CISBN)
+            c = to_isbn10(CISBN)
+            d = hyphenate(to_isbn10(CISBN))
+            e = to_isbn13(CISBN)
+            f = hyphenate(to_isbn13(CISBN))
+            g = V13
+            h = "ISBN {}" .format(CISBN)
+            i = "ISBN {}" .format(canonical(CISBN))
+            j = "ISBN {}" .format(hyphenate(to_isbn13(CISBN)))
+            k = "ISBN {}" .format(V13)
+            l = "ISBN {}" .format(to_isbn10(CISBN))
+            m = "ISBN {}" .format(hyphenate(to_isbn10(CISBN)))
+
+            tests = [a,b,c,d,e,f,g,h,i,j,k,l,m]
+
+            for indie in tests:
+                r = QUERY % indie
+                RQUERY(r)
+                if len(RQUERY(r)) != 0:
+                    print(RQUERY(r))
+                    break
+
+
+        except:
+            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
+            print(Error)
+            file.write(Error)
+
+if __name__ =='__main__':
+    main()
+
+
+
diff --git a/google_isbn_validator copy.py → google_isbn_validator.py b/google_isbn_validator copy.py → google_isbn_validator.py
diff --git a/isbn-validator-post copy.py → isbn-validator-post.py b/isbn-validator-post copy.py → isbn-validator-post.py
diff --git a/isbn_normalization copy.py → isbn_normalization.py b/isbn_normalization copy.py → isbn_normalization.py
diff --git a/isbndb_isbn_validator copy.py → isbndb_isbn_validator.py b/isbndb_isbn_validator copy.py → isbndb_isbn_validator.py
diff --git a/loc_isbn_validator copy.py → loc_isbn_validator.py b/loc_isbn_validator copy.py → loc_isbn_validator.py
diff --git a/wcat_isbn_validator copy.py b/wcat_isbn_validator copy.py
diff --git a/wcat_isbn_validator.py b/wcat_isbn_validator.py
@@ -0,0 +1,75 @@
+__author__ = 'torresal'
+
+""""Note: this script is only intended to retrieve metadata from Wolrd Cat database"""
+
+import re,argparse, time
+from isbnlib import EAN13, clean, meta, canonical
+
+# Output files. Both are opened in write mode when the module is loaded, so
+# existing WCAT-ERRORS.txt / WCAT-DATA.txt files are overwritten.
+# `file` receives the per-record error reports, `file2` the metadata dump.
+# Each file starts with a date line and a 24-hour time line.
+# NOTE(review): neither handle is closed anywhere in this script.
+file = open("WCAT-ERRORS.txt", "w")
+file2 = open("WCAT-DATA.txt", "w")
+DATE = ("DATE:" + time.strftime("%m/%d/%Y"))
+TIME = ("MILITARY TIME:" + time.strftime("%H:%M:%S"))
+file.write(DATE+"\n"+TIME+"\n")
+file2.write(DATE+"\n"+TIME+"\n\n")
+
+#Parses url.json#
+def parse(url):
+    import requests
+    r = requests.get(url, verify = False)
+    JSONdict = r.json()
+    return JSONdict
+
+def main():
+#Commnd line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-path', '--GCIS', help = "Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
+    args = parser.parse_args()
+    GCIS = args.GCIS
+
+
+    if GCIS is None:
+        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
+        print('NO MANUAL GCIS PATH\nALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
+
+GCISPAR = parse(GCIS)
+HREF =
+
+for x in range(len(GCISPAR)):
+#Extracts book identifier from GCIS#
+        IDEN = GCISPAR[x]["identifier"]
+        match =  re.search(r'.*/(.*?)\..*?$', GCIS)
+        if match:
+            FILETYPE = match.groups()[0]
+    #HREF = url that leads to book.json in GCIS-DEV
+        try:
+            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE,IDEN)
+            #HREF = 'https://gcis-search-stage.jpl.net:3000/book/13b8b4fc-3de1-4bd8-82aa-7d3a6aa54ad5.json'
+            HREFPAR = parse(HREF)
+    #Extracts book title and isbn from GCIS-DEV
+            d = dict(HREFPAR)
+            TITLE = d['title']
+            ISBNS = d['isbn']
+    #Cleans ISBNS to only conatian valid characters
+            CISBN = clean(ISBNS)
+    #V13 = validated canonical ISBN-13
+            V13 = EAN13(CISBN)
+            if V13 is None:
+                V13 = canonical(CISBN)
+            M = parse(HREF)
+            v = meta(V13, service = 'wcat', cache ='default')
+            GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n" .format(M, ISBNS, V13)
+            APIDATA = "WorldCat\n\n\t{}\n\n------------\n\n" .format(v)
+            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")
+            print ("WorldCat\n\n\t", v, '\n\n')
+            file2.write(GCISDATA)
+            file2.write(APIDATA)
+
+        except:
+            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
+            print(Error)
+            file.write(Error)
+
+if __name__ =='__main__':
+    main()
+