closes #14, crash on non-utf8 py files, could actually work now

tarpas · Jan 5, 2018 · 1e6a280 · 1e6a280
1 parent ba69b81
commit 1e6a280
Show file tree

Hide file tree

Showing 8 changed files with 80 additions and 40 deletions.
diff --git a/test/print1250r.py b/test/print1250r.py
diff --git a/test/samples/__init__.py b/test/samples/__init__.py
diff --git a/test/samples/print1250r.py b/test/samples/print1250r.py
@@ -0,0 +1,3 @@
+# -*- coding: cp1250 -*-
+
+print("š")
diff --git a/test/test_core.py b/test/test_core.py
@@ -1,10 +1,9 @@
 import pytest
 from collections import namedtuple
 
-from testmon.process_code import Module
+from testmon.process_code import Module, read_file_with_checksum
 from test.test_process_code import CodeSample
-from testmon.testmon_core import TestmonData as CoreTestmonData, SourceTree, flip_dictionary, unaffected, \
-    read_file_with_checksum
+from testmon.testmon_core import TestmonData as CoreTestmonData, SourceTree, flip_dictionary, unaffected
 
 pytest_plugins = "pytester",
 

diff --git a/test/test_process_code.py b/test/test_process_code.py
@@ -1,13 +1,53 @@
+#  -- coding:utf8 --
+
 from test.coveragepy.coveragetest import CoverageTest
 
 import pytest
-from testmon.process_code import Block, Module, checksum_coverage
+from testmon.process_code import Block, Module, checksum_coverage, read_file_with_checksum, process_encoding
+try:
+    from StringIO import StringIO as MemFile
+except ImportError:
+    from io import BytesIO as MemFile
 
 
 def parse(source_code, file_name='a.py'):
     return Module(source_code=source_code, file_name=file_name).blocks
 
 
+def test_detect_encoding1():  # declaration on the second line of the stream
+    lines = []
+    output = MemFile(b'#first comment\n#  -- coding: abcd --')
+    assert process_encoding(lines, output) is None
+    assert lines == [b'#first comment\n']
+    assert process_encoding(lines, output) == 'abcd'
+    assert lines == [b'#first comment\n']  # the declaration line is not buffered
+
+
+def test_detect_encoding2():  # no declaration: both lines end up buffered
+    lines = []
+    output = MemFile(b'1\n2\n')
+    assert process_encoding(lines, output) is None
+    assert lines == [b'1\n']
+    assert process_encoding(lines, output) is None
+    assert lines == [b'1\n', b'2\n']
+
+
+def test_detect_encoding3():  # distinct name so the earlier test is not shadowed
+    with open('test/samples/print1250r.py', 'rb') as f:
+        lines = []
+        assert process_encoding(lines, f) == 'cp1250'
+        assert lines == []  # declaration is on line 1, so nothing is buffered
+
+
+def test_read_file_with_checksum():  # element [0] is the decoded source text
+    assert u'š' in read_file_with_checksum('test/samples/print1250r.py')[0]
+
+
+def test_module_with_1250():  # source_code=None makes Module read the file itself
+    code_repr = Module(None, 'test/samples/print1250r.py').blocks[0].code
+    assert "Str('\\xc5\\xa1')" in code_repr or "Str('š')" in code_repr
+
+
 class TestSourceIntoBlocks(object):
 
     def test_empty(self):
@@ -253,7 +293,6 @@ def test_classes(self):
         assert module1.blocks[1] != module2.blocks[1]
         assert module1.blocks[2] == module2.blocks[2]
 
-
     def test_classes_header(self):
         module1 = Module(code_samples['classes'].source_code)
         module2 = Module(code_samples['classes_c'].source_code)
@@ -277,5 +316,5 @@ def test_easy(self):
         for name, mod_cov in code_samples.items():
             if mod_cov.expected_coverage:
                 self.check_coverage(mod_cov.source_code,
-                                    cov_data = mod_cov.expected_coverage,
+                                    cov_data=mod_cov.expected_coverage,
                                     msg="This is for code_sample['{}']".format(name))
diff --git a/test/test_testmon.py b/test/test_testmon.py
@@ -2,6 +2,7 @@
 import sys
 
 import pytest
+import testmon.process_code
 from test.coveragepy import coveragetest
 from testmon.process_code import Module, checksum_coverage
 from testmon.testmon_core import eval_variant
@@ -103,11 +104,6 @@ def func():
     assert {os.path.relpath(a.strpath, testdir.tmpdir.strpath):
                 checksum_coverage(Module(file_name=a.strpath).blocks, [2])} == deps
 
-def test_detect_encoding():
-    from testmon import testmon_core
-    with open('test/print1250r.py', 'rb') as f:
-        testmon_core.detect_encoding(f.readline() + f.readline()) == 'cp1250'
-
 
 @pytest.mark.xfail
 def test_testmon_recursive(testdir, monkeypatch):

diff --git a/testmon/process_code.py b/testmon/process_code.py
@@ -1,8 +1,13 @@
 import ast
+import hashlib
 import textwrap
 import zlib
 import os
 
+import re
+
+coding_re = re.compile(br'coding[=:]\s*([-\w.]+)')  # raw bytes: \s and \w are regex escapes
+
 
 class Block():
     def __init__(self, start, end, code=0, name=''):
@@ -44,15 +49,14 @@ def __init__(self, source_code=None, file_name='<unknown>', rootdir=''):
         self.blocks = []
         self.counter = 0
         if source_code is None:
-            with open(os.path.join(rootdir, file_name)) as f:
-                source_code = f.read()
+            source_code, _ = read_file_with_checksum(os.path.join(rootdir, file_name))
         else:
             source_code = textwrap.dedent(source_code)
         lines = source_code.splitlines()
         try:
             tree = ast.parse(source_code, file_name)
             self.dump_and_block(tree, len(lines), name=file_name)
-        except SyntaxError:
+        except SyntaxError as e:
             pass
 
     def dump_and_block(self, node, end, name='unknown', into_block=False):
@@ -122,3 +126,27 @@ def checksum_coverage(blocks, lines):
             break
 
     return result
+
+
+def process_encoding(lines, afile):
+    """Return the coding declared on afile's next line, else append that line to lines."""
+    candidate = afile.readline()
+    found = coding_re.search(candidate)
+    if found is None:
+        lines.append(candidate)
+        return None
+    return found.group(1).decode('ascii')
+
+
+def read_file_with_checksum(absfilename):
+    """Read a source file as bytes; return (decoded text, SHA-1 hex digest).
+
+    The first two lines are searched for a coding declaration; without one
+    the bytes are decoded as utf8.  The declaration line itself is consumed
+    by process_encoding, so it is absent from both the text and the digest.
+    """
+    with open(absfilename, 'rb') as afile:
+        head = []
+        encoding = process_encoding(head, afile) or process_encoding(head, afile) or 'utf8'
+        raw = b''.join(head) + afile.read()
+    return raw.decode(encoding), hashlib.sha1(raw).hexdigest()
diff --git a/testmon/testmon_core.py b/testmon/testmon_core.py
@@ -14,12 +14,9 @@
 
 import coverage
 
-from testmon.process_code import checksum_coverage
+from testmon.process_code import checksum_coverage, read_file_with_checksum
 from testmon.process_code import Module
-import codecs
-import re
 
-coding_re = re.compile(b'coding[=:]\s*([-\w.]+)')
 
 if sys.version_info > (3,):
     buffer = memoryview
@@ -151,24 +148,6 @@ def get_variant_inifile(inifile):
     return eval_variant(run_variant_expression)
 
 
-def read_file_with_checksum(absfilename):
-    hasher = hashlib.sha1()
-    with open(absfilename, 'rb') as afile:
-        source = b''.join([afile.readline(), afile.readline()])
-        encoding = detect_encoding(source)
-        source = source + afile.read()
-    hasher.update(source)
-    return source.decode(encoding), hasher.hexdigest()
-
-
-def detect_encoding(beginning):
-    result = coding_re.search(beginning)
-    if result:
-        return result.group(1).decode('ascii')
-    else:
-        return 'utf-8'
-
-
 def parse_file(filename, rootdir, source_code):
     return Module(source_code=source_code, file_name=filename, rootdir=rootdir)