Merge remote-tracking branch 'upstream/master' into Validate-blanks

pandas-dev · Sep 19, 2019 · 689c59a · 689c59a
2 parents 1fc9b0f + 4ac7f9d
commit 689c59a
Show file tree

Hide file tree

Showing 27 changed files with 167 additions and 254 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,3 +15,4 @@ repos:
     hooks:
     -   id: isort
         language: python_venv
+        exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$
diff --git a/README.md b/README.md
@@ -225,7 +225,7 @@ Most development discussion is taking place on github in this repo. Further, the
 
 All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
 
-A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
+A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/docs/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
 
 If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
 

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -104,7 +104,7 @@ jobs:
     displayName: 'Running benchmarks'
     condition: true
 
-- job: 'Docs'
+- job: 'Web_and_Docs'
   pool:
     vmImage: ubuntu-16.04
   timeoutInMinutes: 90
@@ -119,6 +119,11 @@ jobs:
       ci/setup_env.sh
     displayName: 'Setup environment and build pandas'
 
+  - script: |
+      source activate pandas-dev
+      python web/pandas_web.py web/pandas --target-path=web/build
+    displayName: 'Build website'
+
   - script: |
       source activate pandas-dev
       # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547)
@@ -128,15 +133,21 @@ jobs:
     displayName: 'Build documentation'
 
   - script: |
-      cd doc/build/html
+      mkdir -p to_deploy/docs
+      cp -r web/build/* to_deploy/
+      cp -r doc/build/html/* to_deploy/docs/
+    displayName: 'Merge website and docs'
+
+  - script: |
+      cd to_deploy
       git init
       touch .nojekyll
       echo "dev.pandas.io" > CNAME
       printf "User-agent: *\nDisallow: /" > robots.txt
       git add --all .
       git config user.email "[email protected]"
-      git config user.name "pandas-docs-bot"
-      git commit -m "pandas documentation in master"
+      git config user.name "pandas-bot"
+      git commit -m "pandas web and documentation in master"
     displayName: 'Create git repo for docs build'
     condition : |
       and(not(eq(variables['Build.Reason'], 'PullRequest')),
@@ -160,10 +171,10 @@ jobs:
           eq(variables['Build.SourceBranch'], 'refs/heads/master'))
 
   - script: |
-      cd doc/build/html
+      cd to_deploy
       git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git
       git push -f origin master
-    displayName: 'Publish docs to GitHub pages'
+    displayName: 'Publish web and docs to GitHub pages'
     condition : |
       and(not(eq(variables['Build.Reason'], 'PullRequest')),
           eq(variables['Build.SourceBranch'], 'refs/heads/master'))
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
@@ -60,15 +60,21 @@ jobs:
         echo "Creating Environment"
         ci/setup_env.sh
       displayName: 'Setup environment and build pandas'
+
     - script: |
         source activate pandas-dev
         ci/run_tests.sh
       displayName: 'Test'
+
     - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
+      displayName: 'Build versions'
+
     - task: PublishTestResults@2
       inputs:
         testResultsFiles: 'test-data-*.xml'
         testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }}
+      displayName: 'Publish test results'
+
     - powershell: |
         $junitXml = "test-data-single.xml"
         $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"'
@@ -94,6 +100,7 @@ jobs:
           Write-Error "$($matches[1]) tests failed"
         }
       displayName: 'Check for test failures'
+
     - script: |
         source activate pandas-dev
         python ci/print_skipped.py

diff --git a/ci/print_skipped.py b/ci/print_skipped.py
@@ -1,52 +1,40 @@
 #!/usr/bin/env python
-
-import math
 import os
-import sys
 import xml.etree.ElementTree as et
 
 
-def parse_results(filename):
+def main(filename):
+    if not os.path.isfile(filename):
+        return
+
     tree = et.parse(filename)
     root = tree.getroot()
-    skipped = []
-
     current_class = ""
-    i = 1
-    assert i - 1 == len(skipped)
     for el in root.findall("testcase"):
         cn = el.attrib["classname"]
         for sk in el.findall("skipped"):
             old_class = current_class
             current_class = cn
-            name = "{classname}.{name}".format(
-                classname=current_class, name=el.attrib["name"]
-            )
-            msg = sk.attrib["message"]
-            out = ""
             if old_class != current_class:
-                ndigits = int(math.log(i, 10) + 1)
-
-                # 4 for : + space + # + space
-                out += "-" * (len(name + msg) + 4 + ndigits) + "\n"
-            out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg)
-            skipped.append(out)
-            i += 1
-            assert i - 1 == len(skipped)
-    assert i - 1 == len(skipped)
-    # assert len(skipped) == int(root.attrib['skip'])
-    return "\n".join(skipped)
-
-
-def main():
-    test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"]
-
-    print("SKIPPED TESTS:")
-    for fn in test_files:
-        if os.path.isfile(fn):
-            print(parse_results(fn))
-    return 0
+                yield None
+            yield {
+                "class_name": current_class,
+                "test_name": el.attrib["name"],
+                "message": sk.attrib["message"],
+            }
 
 
 if __name__ == "__main__":
-    sys.exit(main())
+    print("SKIPPED TESTS:")
+    i = 1
+    for file_type in ("-single", "-multiple", ""):
+        for test_data in main("test-data{}.xml".format(file_type)):
+            if test_data is None:
+                print("-" * 80)
+            else:
+                print(
+                    "#{i} {class_name}.{test_name}: {message}".format(
+                        **dict(test_data, i=i)
+                    )
+                )
+                i += 1
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
@@ -1,13 +1,6 @@
-#!/bin/bash
+#!/bin/bash -e
 
-set -e
-
-if [ "$DOC" ]; then
-    echo "We are not running pytest as this is a doc-build"
-    exit 0
-fi
-
-# Workaround for pytest-xdist flaky collection order
+# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set)
 # https://github.com/pytest-dev/pytest/issues/920
 # https://github.com/pytest-dev/pytest/issues/1075
 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
@@ -16,7 +9,7 @@ if [ -n "$LOCALE_OVERRIDE" ]; then
     export LC_ALL="$LOCALE_OVERRIDE"
     export LANG="$LOCALE_OVERRIDE"
     PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'`
-    if [[ "$LOCALE_OVERIDE" != "$PANDAS_LOCALE" ]]; then
+    if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then
         echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE"
         # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed
         # exit 1

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -828,7 +828,7 @@ If installed, we now require:
 | pytest (dev)    | 4.0.2           |          |
 +-----------------+-----------------+----------+
 
-For `optional libraries <https://dev.pandas.io/install.html#dependencies>`_ the general recommendation is to use the latest version.
+For `optional libraries <https://dev.pandas.io/docs/install.html#dependencies>`_ the general recommendation is to use the latest version.
 The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
 Optional libraries below the lowest tested version may still work, but are not considered supported.
 

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -109,6 +109,8 @@ Removal of prior version deprecations/changes
 - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`)
 - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`)
 - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`)
+- Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 versions of pandas has been removed (:issue:`27538`)
+-
 
 .. _whatsnew_1000.performance:
 
@@ -147,7 +149,7 @@ Datetimelike
 - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`)
 - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`)
 - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`)
--
+- Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`)
 
 
 Timedelta
@@ -218,6 +220,7 @@ I/O
 - Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. (:issue:`25099`)
 - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`)
 - Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`)
+- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with ``engine='fastparquet'`` if the file did not already exist (:issue:`28326`)
 
 Plotting
 ^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1353,24 +1353,7 @@ def __setstate__(self, state):
         if not isinstance(state, dict):
             raise Exception("invalid pickle state")
 
-        # Provide compatibility with pre-0.15.0 Categoricals.
-        if "_categories" not in state and "_levels" in state:
-            state["_categories"] = self.dtype.validate_categories(state.pop("_levels"))
-        if "_codes" not in state and "labels" in state:
-            state["_codes"] = coerce_indexer_dtype(
-                state.pop("labels"), state["_categories"]
-            )
-
-        # 0.16.0 ordered change
-        if "_ordered" not in state:
-
-            # >=15.0 < 0.16.0
-            if "ordered" in state:
-                state["_ordered"] = state.pop("ordered")
-            else:
-                state["_ordered"] = False
-
-        # 0.21.0 CategoricalDtype change
+        # compat with pre 0.21.0 CategoricalDtype change
         if "_dtype" not in state:
             state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])
 

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -7,7 +7,7 @@
 
 from pandas import DataFrame, get_option
 
-from pandas.io.common import get_filepath_or_buffer, is_s3_url
+from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
 
 
 def get_engine(engine):
@@ -159,12 +159,12 @@ def write(
         if partition_cols is not None:
             kwargs["file_scheme"] = "hive"
 
-        if is_s3_url(path):
-            # path is s3:// so we need to open the s3file in 'wb' mode.
+        if is_s3_url(path) or is_gcs_url(path):
+            # if path is s3:// or gs:// we need to open the file in 'wb' mode.
             # TODO: Support 'ab'
 
             path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
-            # And pass the opened s3file to the fastparquet internal impl.
+            # And pass the opened file to the fastparquet internal impl.
             kwargs["open_with"] = lambda path, _: path
         else:
             path, _, _, _ = get_filepath_or_buffer(path)

diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
@@ -528,32 +528,33 @@ def test_as_array_datetime_tz(self):
         assert mgr.get("g").dtype == "datetime64[ns, CET]"
         assert mgr.as_array().dtype == "object"
 
-    def test_astype(self):
+    @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"])
+    def test_astype(self, t):
         # coerce all
         mgr = create_mgr("c: f4; d: f2; e: f8")
-        for t in ["float16", "float32", "float64", "int32", "int64"]:
-            t = np.dtype(t)
-            tmgr = mgr.astype(t)
-            assert tmgr.get("c").dtype.type == t
-            assert tmgr.get("d").dtype.type == t
-            assert tmgr.get("e").dtype.type == t
+
+        t = np.dtype(t)
+        tmgr = mgr.astype(t)
+        assert tmgr.get("c").dtype.type == t
+        assert tmgr.get("d").dtype.type == t
+        assert tmgr.get("e").dtype.type == t
 
         # mixed
         mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8")
-        for t in ["float16", "float32", "float64", "int32", "int64"]:
-            t = np.dtype(t)
-            tmgr = mgr.astype(t, errors="ignore")
-            assert tmgr.get("c").dtype.type == t
-            assert tmgr.get("e").dtype.type == t
-            assert tmgr.get("f").dtype.type == t
-            assert tmgr.get("g").dtype.type == t
-
-            assert tmgr.get("a").dtype.type == np.object_
-            assert tmgr.get("b").dtype.type == np.object_
-            if t != np.int64:
-                assert tmgr.get("d").dtype.type == np.datetime64
-            else:
-                assert tmgr.get("d").dtype.type == t
+
+        t = np.dtype(t)
+        tmgr = mgr.astype(t, errors="ignore")
+        assert tmgr.get("c").dtype.type == t
+        assert tmgr.get("e").dtype.type == t
+        assert tmgr.get("f").dtype.type == t
+        assert tmgr.get("g").dtype.type == t
+
+        assert tmgr.get("a").dtype.type == np.object_
+        assert tmgr.get("b").dtype.type == np.object_
+        if t != np.int64:
+            assert tmgr.get("d").dtype.type == np.datetime64
+        else:
+            assert tmgr.get("d").dtype.type == t
 
     def test_convert(self):
         def _compare(old_mgr, new_mgr):

diff --git a/pandas/tests/io/data/categorical.0.25.0.pickle b/pandas/tests/io/data/categorical.0.25.0.pickle