From 73c0553a1e1544b04b7440b590ad2903155e6d71 Mon Sep 17 00:00:00 2001 From: MacHu-GWU Date: Mon, 8 Aug 2016 16:26:05 -0400 Subject: [PATCH] 2016-08-08 1. Improved sort by distance method. 2. Add new test case for multiple sort_by keywords. --- source/index.rst | 12 ++++ tests/test_searchengine.py | 116 +++++++++++++++++++++++-------------- uszipcode/__init__.py | 2 +- uszipcode/searchengine.py | 86 ++++++++++++++------------- 4 files changed, 133 insertions(+), 83 deletions(-) diff --git a/source/index.rst b/source/index.rst index bac451585f..932104799b 100644 --- a/source/index.rst +++ b/source/index.rst @@ -143,6 +143,18 @@ You can search zipcode by city name. 'Vienna' +**uszipcode also provides an internal method to help you find the correct city name**: + +.. code-block:: python + + >>> search._find_city("phonix", best_match=True) + ['Phoenix'] + + # Find city in Kansas; the state name is also typo tolerant + >>> search._find_city("kersen", state="kensas", best_match=False) + ['Nickerson'] + + .. _by_state: Search by State diff --git a/tests/test_searchengine.py b/tests/test_searchengine.py index 0ff4dd23e9..c221d4bd5d 100644 --- a/tests/test_searchengine.py +++ b/tests/test_searchengine.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import pytest +from collections import OrderedDict from uszipcode.searchengine import Zipcode, ZipcodeSearchEngine from uszipcode.packages.haversine import great_circle @@ -11,17 +12,19 @@ def is_all_ascending(array): """ for i, j in zip(array[1:], array[:-1]): if (i is not None) and (j is not None): - assert i - j >= 0 + assert i >= j + def is_all_descending(array): """Assert that this is a strictly desceding array. 
""" for i, j in zip(array[1:], array[:-1]): if (i is not None) and (j is not None): - assert i - j <= 0 + assert i <= j class TestZipcode(object): + def test_init(self): z = Zipcode(Zipcode="10001") assert z.Zipcode == "10001" @@ -69,6 +72,7 @@ def test_output(self): class TestZipcodeSearchEngine(object): + def test_sql_create_order_by(self): with ZipcodeSearchEngine() as search: sql = search._sql_create_order_by("Zipcode", True) @@ -94,17 +98,18 @@ def test_sql_create_lower_upper(self): with pytest.raises(ValueError): sql = search._sql_create_lower_upper("Population", None, None) with pytest.raises(ValueError): - sql = search._sql_create_lower_upper("Population", "SQL", "SQL") - + sql = search._sql_create_lower_upper( + "Population", "SQL", "SQL") + sql = search._sql_create_lower_upper("Population", 0, None) assert sql == "Population >= 0" - + sql = search._sql_create_lower_upper("Population", None, 999999) assert sql == "Population <= 999999" - + sql = search._sql_create_lower_upper("Population", 0, 999999) assert sql == "Population >= 0 AND Population <= 999999" - + def test_search_by_zipcode(self): with ZipcodeSearchEngine() as search: for zipcode in [10001, "10001"]: @@ -120,17 +125,20 @@ def test_search_by_coordinate(self): with ZipcodeSearchEngine() as search: # 在马里兰选一个坐标, 返回1000条, 但实际上不到1000条 lat, lng = 39.114407, -77.205758 - + # 返回的结果必须按照距离是从小到大的 res1 = search.by_coordinate(lat, lng, ascending=True, returns=1000) len(res1) < 1000 - dist_array = [great_circle((lat, lng), (z.Latitude, z.Longitude), miles=True) for z in res1] + dist_array = [ + great_circle((lat, lng), (z.Latitude, z.Longitude), miles=True) for z in res1] is_all_ascending(dist_array) - - res2 = search.by_coordinate(lat, lng, ascending=False, returns=1000) - dist_array = [great_circle((lat, lng), (z.Latitude, z.Longitude), miles=True) for z in res2] + + res2 = search.by_coordinate( + lat, lng, ascending=False, returns=1000) + dist_array = [ + great_circle((lat, lng), (z.Latitude, 
z.Longitude), miles=True) for z in res2] is_all_descending(dist_array) - + # 当returns = 0时, 返回所有符合条件的 res3 = search.by_coordinate(lat, lng, returns=0) assert len(res1) == len(res3) @@ -138,32 +146,38 @@ def test_search_by_coordinate(self): # 当没有符合条件的zipcode时, 返回空列表 res3 = search.by_coordinate(lat, lng, radius=-1) assert len(res3) == 0 - + def test_find_state(self): with ZipcodeSearchEngine() as search: assert search._find_state("mary", best_match=True) == ["MD", ] - + result = set(search._find_state("virgin", best_match=False)) assert result == set(["VI", "WV", "VA"]) - + assert search._find_state("newyork", best_match=False) == ["NY", ] - + with pytest.raises(ValueError): search._find_state("THIS IS NOT A STATE!", best_match=True) - + with pytest.raises(ValueError): search._find_state("THIS IS NOT A STATE!", best_match=False) - + def test_find_city(self): with ZipcodeSearchEngine() as search: - assert search._find_city("phonix", best_match=True) == [ - "Phoenix", ] - assert search._find_city("kerson", best_match=False) == [ - "Dickerson Run", "Dickerson", "Nickerson", "Emerson", "Everson" - ] - assert search._find_city("kersen", state="kensas", best_match=False) == [ - "Nickerson", ] - + city_result = search._find_city("phonix", best_match=True) + city_expected = ["Phoenix", ] + assert city_result == city_expected + + city_result = search._find_city("kerson", best_match=False) + city_result.sort() + city_expected = ["Dickerson", "Dickerson Run", "Emerson", "Ericson", "Everson", "Nickerson"] + for city in city_result: + assert city in city_expected + + city_result = search._find_city("kersen", state="kensas", best_match=False) + city_expected = ["Nickerson", ] + assert city_result == city_expected + def test_by_city_and_state(self): with ZipcodeSearchEngine() as search: # Arlington, VA @@ -172,11 +186,11 @@ def test_by_city_and_state(self): z.City == "Arlington" z.State == "VA" assert len(res) == 5 - + # There's no city in VI with pytest.raises(ValueError): 
search.by_city_and_state(city="Arlington", state="vi") - + def test_by_city(self): with ZipcodeSearchEngine() as search: res = search.by_city("vienna") @@ -185,7 +199,7 @@ def test_by_city(self): assert z.City == "Vienna" s.add(z.State) assert s == set(["ME", "MD", "VA"]) - + def test_by_state(self): with ZipcodeSearchEngine() as search: res = search.by_state("RI") @@ -204,7 +218,7 @@ def test_by_prefix(self): sort_by=sort_key, ascending=True, returns=0) l = list() for z in res: - assert z.Zipcode.startswith(prefix) # example prefix + assert z.Zipcode.startswith(prefix) # example prefix l.append(z[sort_key]) l_sorted = list(l) l_sorted.sort() @@ -253,8 +267,25 @@ def test_by_house(self): res = search.by_house(lower=20000, sort_by="HouseOfUnits", ascending=False, returns=0) assert len(res) == 741 - - def test_find(self): + + def test_sort_by_multiple_keywords(self): + with ZipcodeSearchEngine() as search: + res = search.by_state( + state="CA", sort_by=["City", "Zipcode"], ascending=[True, True], returns=1000) + + stat = OrderedDict() + for zipcode in res: + try: + stat[zipcode.City].append(zipcode.Zipcode) + except: + stat[zipcode.City] = [zipcode.Zipcode, ] + + city_list = list(stat.keys()) + is_all_ascending(city_list) + for zipcode_list in stat.values(): + is_all_ascending(list(zipcode_list)) + + def test_find(self): with ZipcodeSearchEngine() as search: # Find most people living zipcode in New York res = search.find( @@ -262,7 +293,7 @@ def test_find(self): sort_by="Population", ascending=False, ) is_all_descending([z.Population for z in res]) - + # Find all zipcode in California that prefix is "999" res = search.find( state="califor", @@ -275,12 +306,12 @@ def test_find(self): assert z.State == "CA" assert z.Zipcode.startswith("95") is_all_descending([z.HouseOfUnits for z in res]) - + # Find top 10 richest zipcode near Silicon Valley lat, lng = 37.391184, -122.082235 radius = 100 res = search.find( - lat=lat, + lat=lat, lng=lng, radius=radius, 
sort_by="Wealthy", ascending=False, @@ -288,15 +319,16 @@ def test_find(self): ) assert len(res) == 10 for z in res: - assert great_circle((lat, lng), (z.Latitude, z.Longitude)) <= radius + assert great_circle( + (lat, lng), (z.Latitude, z.Longitude)) <= radius is_all_descending([z.Wealthy for z in res]) - - # Find zipcode that average personal annual income greater than - # 100000 near Silicon Valley, order by distance + + # Find zipcode that average personal annual income greater than + # 100000 near Silicon Valley, order by distance lat, lng = 37.391184, -122.082235 radius = 100 res = search.find( - lat=lat, + lat=lat, lng=lng, radius=radius, wealthy_lower=60000, @@ -309,7 +341,7 @@ def test_find(self): is_all_ascending([ great_circle((lat, lng), (z.Latitude, z.Longitude)) for z in res ]) - + def test_edge_case(self): with ZipcodeSearchEngine() as search: zipcode = search.by_zipcode(00000) diff --git a/uszipcode/__init__.py b/uszipcode/__init__.py index 5e7647cee4..07112f9c34 100644 --- a/uszipcode/__init__.py +++ b/uszipcode/__init__.py @@ -8,7 +8,7 @@ print(e) -__version__ = "0.1.2" +__version__ = "0.1.3" __short_description__ = ("USA zipcode programmable database, includes " "up-to-date census and geometry information.") __license__ = "MIT" diff --git a/uszipcode/searchengine.py b/uszipcode/searchengine.py index 0c067b050b..178b1fb4b2 100644 --- a/uszipcode/searchengine.py +++ b/uszipcode/searchengine.py @@ -4,7 +4,7 @@ import json import math import sqlite3 -from heapq import heappush, heappop +from heapq import heappush, heappop, nlargest, nsmallest from functools import total_ordering from collections import OrderedDict @@ -24,6 +24,7 @@ @total_ordering class Zipcode(object): + """Zipcode data container class. 
Attributes: @@ -85,7 +86,8 @@ def __init__(self, # estimate population per square miles (on land only) Density=None, TotalWages=None, # estimate annual total wage - Wealthy=None, # estimate average annual wage = TotalWages/Population + # estimate average annual wage = TotalWages/Population + Wealthy=None, HouseOfUnits=None, # estimate number of house unit LandArea=None, # land area in square miles WaterArea=None, # marine area in square miles @@ -205,6 +207,7 @@ def __hash__(self): class ZipcodeSearchEngine(object): + """A fast, powerful index optimized zipcode object search engine class. Quick links: @@ -387,7 +390,7 @@ def export_to_csv(self, res, abspath): def _find_state(self, state, best_match=True): """Fuzzy search correct state. - :param multiple: bool, when False, only one state will return. + :param best_match: bool, when True, only one state will return. otherwise, will return all matching states. """ result = list() @@ -417,6 +420,11 @@ def _find_state(self, state, best_match=True): def _find_city(self, city, state=None, best_match=True): """Fuzzy search correct city. + :param city: city name. + :param state: search city in specified state. + :param best_match: bool, when True, only one city will return. + otherwise, will return all matching cities. + **中文文档** 如果给定了state, 则只在state里的城市中寻找, 否则, 在全国所有的城市中 @@ -485,6 +493,7 @@ def by_coordinate(self, lat, lng, radius=50.0, ascending=True, standard_only=Tru :param lng: center lngitude :param radius: for the inside implementation only, search zipcode within #radius units of lat, lng + :param ascending: bool, if True, sort by distance from closest :param standard_only: bool, default True, only returns standard type zipcode :param returns: returns at most how many results @@ -513,7 +522,7 @@ def by_city_and_state(self, city, state, :param state: 2 letter short name or long name. 
:param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -540,7 +549,7 @@ def by_city(self, city, :param city: city name. :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -567,7 +576,7 @@ def by_state(self, state, :param state: 2 letter short name or long name. :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -591,7 +600,7 @@ def by_prefix(self, prefix, :param prefix: first N zipcode number :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -600,10 +609,10 @@ def by_prefix(self, prefix, 根据Zipcode的前面几个字符模糊查询。 """ # exam input - return self.find(prefix=prefix, - standard_only=standard_only, - sort_by=sort_by, - ascending=ascending, + return self.find(prefix=prefix, + standard_only=standard_only, + sort_by=sort_by, + ascending=ascending, returns=returns) def by_pattern(self, pattern, @@ -616,7 +625,7 @@ def by_pattern(self, pattern, :param prefix: first N zipcode number :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default 
``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -641,7 +650,7 @@ def by_population(self, lower=-1, upper=2**30, :param upper: maximum population :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -669,7 +678,7 @@ def by_density(self, lower=-1, upper=2**30, :param upper: maximum population :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -695,7 +704,7 @@ def by_landarea(self, lower=-1, upper=2**30, :param upper: maximum landarea in sqrt miles :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -721,7 +730,7 @@ def by_waterarea(self, lower=-1, upper=2**30, :param upper: maximum waterarea in sqrt miles :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -747,7 +756,7 @@ def by_totalwages(self, lower=-1, upper=2**30, :param upper: maximum total annual wages :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True 
:param returns: int, default 5 @@ -775,7 +784,7 @@ def by_wealthy(self, lower=-1, upper=2**30, :param upper: maximum AAW :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -801,7 +810,7 @@ def by_house(self, lower=-1, upper=2**30, :param upper: maximum house of units :param standard_only: bool, default True, only returns standard type zipcode - :param sortby: str or list of str, default ``"Zipcode"`` + :param sort_by: str or list of str, default ``"Zipcode"`` :param ascending: bool or list of bool, default True :param returns: int, default 5 @@ -959,7 +968,7 @@ def find(self, select_sql = self._sql_modify_standard_only(select_sql, standard_only) select_sql = self._sql_modify_order_by(select_sql, sort_by, ascending) - #--- solve coordinate and other search sort-by conflict --- + #--- solve coordinate and other search sort_by conflict --- if flag_by_coordinate: # has sort_by keyword, order by keyword # 有sort_by关键字的情况下, 按关键字排序 @@ -976,28 +985,25 @@ def find(self, # 没有sort_by关键字, 按距离远近排序 else: # use heap sort find top N closest zipcode - heap = list() + def gen(): + for row in self.cursor.execute(select_sql): + dist = great_circle( + (row["Latitude"], row["Longitude"]), (lat, lng)) + if dist <= radius: + yield (dist, row) - for row in self.cursor.execute(select_sql): - dist = great_circle( - (row["Latitude"], row["Longitude"]), (lat, lng)) - if dist <= radius: - heappush(heap, (dist, row)) - - # generate results - res = list() if returns >= 1: - try: - for i in range(returns): - res.append(Zipcode(**heappop(heap)[1])) - except IndexError: - pass - elif returns == 0: - while heap: - res.append(Zipcode(**heappop(heap)[1])) - - if ascending is False: # 按距离逆序输出 - res = res[::-1] + + if ascending: + data = nsmallest(returns, gen(), key=lambda x: 
x[0]) + else: + data = nlargest(returns, gen(), key=lambda x: x[0]) + else: + # returns == 0: keep every match, sorted by distance + data = sorted( + gen(), key=lambda x: x[0], reverse=not ascending) + + res = [Zipcode(**row) for dist, row in data] else: select_sql = self._sql_modify_limit(select_sql, returns) res = [Zipcode(**row) for row in self.cursor.execute(select_sql)]