Skip to content

Commit

Permalink
Handle CSVs with different encodings, closes #23
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Jun 9, 2022
1 parent 1166fce commit 383014c
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 12 deletions.
11 changes: 8 additions & 3 deletions datasette_upload_csvs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datasette import hookimpl
from datasette.utils.asgi import Response, Forbidden
from charset_normalizer import detect
from starlette.requests import Request
from urllib.parse import quote_plus
import csv as csv_std
Expand Down Expand Up @@ -71,6 +72,12 @@ async def upload_csvs(scope, receive, datasette, request):
total_size = get_temporary_file_size(csv.file)
task_id = str(uuid.uuid4())

# Use the first 2KB (2048 bytes) to detect the character encoding
first_bytes = csv.file.read(2048)
csv.file.seek(0)
encoding = detect(first_bytes)["encoding"]
print(encoding)

def insert_initial_record(conn):
database = sqlite_utils.Database(conn)
database["_csv_progress_"].insert(
Expand All @@ -91,9 +98,7 @@ def insert_initial_record(conn):
await db.execute_write_fn(insert_initial_record)

def insert_docs(database):

# TODO: Support other encodings:
reader = csv_std.reader(codecs.iterdecode(csv.file, "utf-8"))
reader = csv_std.reader(codecs.iterdecode(csv.file, encoding))
headers = next(reader)

docs = (dict(zip(headers, row)) for row in reader)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def get_long_description():
"starlette",
"aiofiles",
"python-multipart",
"charset-normalizer",
"sqlite-utils",
],
extras_require={
Expand Down
57 changes: 48 additions & 9 deletions tests/test_datasette_upload_csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,55 @@ async def test_menu(auth):
assert "/-/upload-csvs" not in response.text


# Minimal ASCII-only CSV payload: a header row followed by two data rows.
SIMPLE = b"name,age\nCleo,5\nPancakes,4"
# Rows the upload is expected to produce from SIMPLE (every value stays a string).
SIMPLE_EXPECTED = [{"name": "Cleo", "age": "5"}, {"name": "Pancakes", "age": "4"}]
# CSV payload that is deliberately not valid UTF-8: the two cost column headers
# contain the single byte 0xA3, which the expected rows below decode as the
# pound sign. Uses CRLF line endings and has no trailing newline.
NOT_UTF8 = (
b"IncidentNumber,DateTimeOfCall,CalYear,FinYear,TypeOfIncident,PumpCount,PumpHoursTotal,HourlyNotionalCost(\xa3),IncidentNotionalCost(\xa3)\r\n"
b"139091,01/01/2009 03:01,2009,2008/09,Special Service,1,2,255,510\r\n"
b"275091,01/01/2009 08:51,2009,2008/09,Special Service,1,1,255,255"
)
# Rows expected from NOT_UTF8 once its encoding has been detected correctly:
# the 0xA3 bytes in the headers must come through as "£" in the column names.
NOT_UTF8_EXPECTED = [
{
"IncidentNumber": "139091",
"DateTimeOfCall": "01/01/2009 03:01",
"CalYear": "2009",
"FinYear": "2008/09",
"TypeOfIncident": "Special Service",
"PumpCount": "1",
"PumpHoursTotal": "2",
"HourlyNotionalCost(£)": "255",
"IncidentNotionalCost(£)": "510",
},
{
"IncidentNumber": "275091",
"DateTimeOfCall": "01/01/2009 08:51",
"CalYear": "2009",
"FinYear": "2008/09",
"TypeOfIncident": "Special Service",
"PumpCount": "1",
"PumpHoursTotal": "1",
"HourlyNotionalCost(£)": "255",
"IncidentNotionalCost(£)": "255",
},
]


@pytest.mark.asyncio
@pytest.mark.parametrize(
"filename,expected_url",
"filename,content,expected_url,expected_rows",
(
("dogs.csv", "/data/dogs"),
("weird ~ filename here.csv.csv", "/data/weird~20~7E~20filename~20here~2Ecsv"),
("dogs.csv", SIMPLE, "/data/dogs", SIMPLE_EXPECTED),
(
"weird ~ filename here.csv.csv",
SIMPLE,
"/data/weird~20~7E~20filename~20here~2Ecsv",
SIMPLE_EXPECTED,
),
("not-utf8.csv", NOT_UTF8, "/data/not-utf8", NOT_UTF8_EXPECTED),
),
)
@pytest.mark.parametrize("use_xhr", (True, False))
async def test_upload(tmpdir, filename, expected_url, use_xhr):
async def test_upload(tmpdir, filename, content, expected_url, expected_rows, use_xhr):
path = str(tmpdir / "data.db")
db = sqlite_utils.Database(path)

Expand All @@ -72,7 +111,7 @@ async def test_upload(tmpdir, filename, expected_url, use_xhr):
cookies["ds_csrftoken"] = csrftoken

# Now try uploading a file
files = {"csv": (filename, b"name,age\nCleo,5\nPancakes,4", "text/csv")}
files = {"csv": (filename, content, "text/csv")}
response = await client.post(
"http://localhost/-/upload-csvs",
cookies=cookies,
Expand All @@ -94,13 +133,13 @@ async def test_upload(tmpdir, filename, expected_url, use_xhr):
assert 1 == len(rows)
assert {
"filename": filename[:-4], # Strip off .csv ending
"bytes_todo": 26,
"bytes_done": 26,
"bytes_todo": len(content),
"bytes_done": len(content),
"rows_done": 2,
}.items() <= rows[0].items()

dogs = list(db[filename[:-4]].rows)
assert [{"name": "Cleo", "age": "5"}, {"name": "Pancakes", "age": "4"}] == dogs
rows = list(db[filename[:-4]].rows)
assert rows == expected_rows


@pytest.mark.asyncio
Expand Down

0 comments on commit 383014c

Please sign in to comment.