Fix ruff PD901 and prefer sum over len+if (#4012)

* simplify: prefer sum over len + if astral-sh/ruff#13050 * fix ruff PD901 pandas-df-variable-name * fix cast df_energies as float in cp2k parse_energy_file
materialsproject · Aug 22, 2024 · 016f9de · 016f9de
1 parent 6ca78b3
commit 016f9de
Show file tree

Hide file tree

Showing 17 changed files with 96 additions and 107 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -178,23 +178,22 @@ output-format = "concise"
 select = ["ALL"]
 ignore = [
     # Rule families
-    "ANN",  # flake8-annotations (not ready, require types for ALL args)
-    "ARG",  # Check for unused function arguments
-    "BLE",  # General catch of Exception
-    "C90",  # Check for functions with a high McCabe complexity
-    "COM",  # flake8-commas (conflict with line wrapper)
-    "CPY",  # Missing copyright notice at top of file (need preview mode)
-    "EM",   # Format nice error messages
-    "ERA",  # Check for commented-out code
-    "FIX",  # Check for FIXME, TODO and other developer notes
-    "FURB", # refurb (need preview mode, too many preview errors)
-    "G",    # Validate logging format strings
-    "INP",  # Ban PEP-420 implicit namespace packages
-    "N",    # PEP8-naming (many var/arg names are intended)
-    "PTH",  # Prefer pathlib over os.path
-    "SLF",  # Access "private" class members
-    "T20",  # Check for print/pprint
-    "TD",   # TODO tags related
+    "ANN", # flake8-annotations (not ready, require types for ALL args)
+    "ARG", # Check for unused function arguments
+    "BLE", # General catch of Exception
+    "C90", # Check for functions with a high McCabe complexity
+    "COM", # flake8-commas (conflict with line wrapper)
+    "CPY", # Missing copyright notice at top of file (need preview mode)
+    "EM",  # Format nice error messages
+    "ERA", # Check for commented-out code
+    "FIX", # Check for FIXME, TODO and other developer notes
+    "G",   # Validate logging format strings
+    "INP", # Ban PEP-420 implicit namespace packages
+    "N",   # PEP8-naming (many var/arg names are intended)
+    "PTH", # Prefer pathlib over os.path
+    "SLF", # Access "private" class members
+    "T20", # Check for print/pprint
+    "TD",  # TODO tags related
 
     # Single rules
     "B023",    # Function definition does not bind loop variable
@@ -207,13 +206,13 @@ ignore = [
     "FBT001",  # Boolean-typed positional argument in function definition
     "FBT002",  # Boolean default positional argument in function
     "NPY201",  # TODO: enable after migration to NumPy 2.0
-    "PD901",   # pandas-df-variable-name
     "PERF203", # Use of try-except in for/while loop
     "PERF401", # Replace "for" loops with list comprehension
     "PLR0911", # Too many return statements
     "PLR0912", # Too many branches
     "PLR0913", # Too many arguments
     "PLR0915", # Too many statements
+    "PLR1702", # Too many nested blocks
     "PLR2004", # Magic-value-comparison TODO fix these
     "PLW2901", # Outer for loop variable overwritten by inner assignment target
     "PT013",   # Incorrect import of pytest

diff --git a/src/pymatgen/analysis/chemenv/utils/scripts_utils.py b/src/pymatgen/analysis/chemenv/utils/scripts_utils.py
@@ -129,10 +129,7 @@ def draw_cg(
                 faces = cg.faces(neighbors)
                 edges = cg.edges(neighbors)
             symbol = next(iter(site.species)).symbol
-            if faces_color_override:
-                color = faces_color_override
-            else:
-                color = [float(i) / 255 for i in vis.el_color_mapping[symbol]]
+            color = faces_color_override or [float(i) / 255 for i in vis.el_color_mapping[symbol]]
             vis.add_faces(faces, color, opacity=0.4)
             vis.add_edges(edges)
         if show_perfect:

diff --git a/src/pymatgen/analysis/functional_groups.py b/src/pymatgen/analysis/functional_groups.py
@@ -169,7 +169,7 @@ def get_special_carbon(self, elements=None):
 
             neighbor_spec = [str(self.species[n]) for n in neighbors]
 
-            ons = len([n for n in neighbor_spec if n in ["O", "N", "S"]])
+            ons = sum(n in ["O", "N", "S"] for n in neighbor_spec)
 
             if len(neighbors) == 4 and ons >= 2:
                 specials.add(node)

diff --git a/src/pymatgen/analysis/magnetism/heisenberg.py b/src/pymatgen/analysis/magnetism/heisenberg.py
@@ -770,32 +770,27 @@ def _do_screen(structures, energies):
             screened_structures (list): Sorted structures.
             screened_energies (list): Sorted energies.
         """
-        magmoms = [s.site_properties["magmom"] for s in structures]
-        n_below_1ub = [len([m for m in ms if abs(m) < 1]) for ms in magmoms]
-
-        df = pd.DataFrame(
-            {
-                "structure": structures,
-                "energy": energies,
-                "magmoms": magmoms,
-                "n_below_1ub": n_below_1ub,
-            }
+        magmoms = [struct.site_properties["magmom"] for struct in structures]
+        n_below_1ub = [sum(abs(m) < 1 for m in ms) for ms in magmoms]
+
+        df_mag = pd.DataFrame(
+            {"structure": structures, "energy": energies, "magmoms": magmoms, "n_below_1ub": n_below_1ub}
         )
 
         # keep the ground and first excited state fixed to capture the
         # low-energy spectrum
-        index = list(df.index)[2:]
-        df_high_energy = df.iloc[2:]
+        index = list(df_mag.index)[2:]
+        df_high_energy = df_mag.iloc[2:]
 
         # Prioritize structures with fewer magmoms < 1 uB
         df_high_energy = df_high_energy.sort_values(by="n_below_1ub")
 
         index = [0, 1, *df_high_energy.index]
 
         # sort
-        df = df.reindex(index)
-        screened_structures = list(df["structure"].values)
-        screened_energies = list(df["energy"].values)
+        df_mag = df_mag.reindex(index)
+        screened_structures = list(df_mag["structure"].values)
+        screened_energies = list(df_mag["energy"].values)
 
         return screened_structures, screened_energies
 

diff --git a/src/pymatgen/analysis/phase_diagram.py b/src/pymatgen/analysis/phase_diagram.py
@@ -3036,7 +3036,7 @@ def get_marker_props(coords, entries):
                     for el, axis in zip(self._pd.elements, range(self._dim), strict=True):
                         _cartesian_positions = [x, y, z]
                         _cartesian_positions[axis].append(entry.composition[el])
-                        label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
+                        label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
                 elif self._dim == 3 and self.ternary_style == "3d":
                     x.append(coord[0])
                     y.append(coord[1])
@@ -3047,7 +3047,7 @@ def get_marker_props(coords, entries):
                         entry.composition[el] for el, _axis in zip(self._pd.elements, range(self._dim), strict=True)
                     )
                     for el, _axis in zip(self._pd.elements, range(self._dim), strict=True):
-                        label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
+                        label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
                 elif self._dim == 4:
                     x.append(coord[0])
                     y.append(coord[1])
@@ -3058,7 +3058,7 @@ def get_marker_props(coords, entries):
                         entry.composition[el] for el, _axis in zip(self._pd.elements, range(self._dim), strict=True)
                     )
                     for el, _axis in zip(self._pd.elements, range(self._dim), strict=True):
-                        label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
+                        label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
                 else:
                     x.append(coord[0])
                     y.append(coord[1])

diff --git a/src/pymatgen/command_line/vampire_caller.py b/src/pymatgen/command_line/vampire_caller.py
@@ -381,13 +381,12 @@ def parse_stdout(vamp_stdout, n_mats: int) -> tuple:
         names = ["T", "m_total", *[f"m_{idx + 1}" for idx in range(n_mats)], "X_x", "X_y", "X_z", "X_m", "nan"]
 
         # Parsing vampire MC output
-        df = pd.read_csv(vamp_stdout, sep="\t", skiprows=9, header=None, names=names)
-        df = df.drop("nan", axis=1)
+        df_stdout = pd.read_csv(vamp_stdout, sep="\t", skiprows=9, header=None, names=names).drop("nan", axis=1)
 
-        parsed_out = df.to_json()
+        parsed_out = df_stdout.to_json()
 
         # Max of susceptibility <-> critical temp
-        critical_temp = df.iloc[df.X_m.idxmax()]["T"]
+        critical_temp = df_stdout.iloc[df_stdout.X_m.idxmax()]["T"]
 
         return parsed_out, critical_temp
 

diff --git a/src/pymatgen/core/lattice.py b/src/pymatgen/core/lattice.py
@@ -1744,7 +1744,7 @@ def get_integer_index(
     mi += 0  # converts -0 to 0
 
     def n_minus(index):
-        return len([h for h in index if h < 0])
+        return sum(h < 0 for h in index)
 
     if n_minus(mi) > n_minus(mi * -1):
         mi *= -1

diff --git a/src/pymatgen/core/structure.py b/src/pymatgen/core/structure.py
@@ -2806,10 +2806,10 @@ def as_dataframe(self) -> pd.DataFrame:
                 row.append(site.properties.get(key))
             data.append(row)
 
-        df = pd.DataFrame(data, columns=["Species", *"abcxyz", *prop_keys])
-        df.attrs["Reduced Formula"] = self.composition.reduced_formula
-        df.attrs["Lattice"] = self.lattice
-        return df
+        df_struct = pd.DataFrame(data, columns=["Species", *"abcxyz", *prop_keys])
+        df_struct.attrs["Reduced Formula"] = self.composition.reduced_formula
+        df_struct.attrs["Lattice"] = self.lattice
+        return df_struct
 
     @classmethod
     def from_dict(

diff --git a/src/pymatgen/core/surface.py b/src/pymatgen/core/surface.py
@@ -2114,7 +2114,7 @@ def math_lcm(a: int, b: int) -> int:
     transf_hkl = np.array([idx // divisor for idx in transf_hkl])
 
     # Get positive Miller index
-    if len([i for i in transf_hkl if i < 0]) > 1:
+    if sum(idx < 0 for idx in transf_hkl) > 1:
         transf_hkl *= -1
 
     return tuple(transf_hkl)

diff --git a/src/pymatgen/electronic_structure/cohp.py b/src/pymatgen/electronic_structure/cohp.py
@@ -193,7 +193,7 @@ def has_antibnd_states_below_efermi(
             limit (float): Only COHP higher than this value will be considered.
         """
         populations = self.cohp
-        n_energies_below_efermi = len([energy for energy in self.energies if energy <= self.efermi])
+        n_energies_below_efermi = sum(energy <= self.efermi for energy in self.energies)
 
         if populations is None:
             return None

diff --git a/src/pymatgen/entries/mixing_scheme.py b/src/pymatgen/entries/mixing_scheme.py
@@ -238,8 +238,8 @@ def process_entries(
                 processed_entry_list.append(entry)
 
         if verbose:
-            count_type_1 = len([e for e in processed_entry_list if e.parameters["run_type"] in self.valid_rtypes_1])
-            count_type_2 = len([e for e in processed_entry_list if e.parameters["run_type"] in self.valid_rtypes_2])
+            count_type_1 = sum(entry.parameters["run_type"] in self.valid_rtypes_1 for entry in processed_entry_list)
+            count_type_2 = sum(entry.parameters["run_type"] in self.valid_rtypes_2 for entry in processed_entry_list)
             print(
                 f"\nProcessing complete. Mixed entries contain {count_type_1} {self.run_type_1} and {count_type_2} "
                 f"{self.run_type_2} entries.\n"

diff --git a/src/pymatgen/io/cp2k/outputs.py b/src/pymatgen/io/cp2k/outputs.py
@@ -1636,12 +1636,12 @@ def parse_energy_file(energy_file):
         "conserved_quantity",
         "used_time",
     ]
-    df = pd.read_csv(energy_file, skiprows=1, names=columns, sep=r"\s+")
-    df["kinetic_energy"] = df["kinetic_energy"] * Ha_to_eV
-    df["potential_energy"] = df["potential_energy"] * Ha_to_eV
-    df["conserved_quantity"] = df["conserved_quantity"] * Ha_to_eV
-    df.astype(float)
-    return {c: df[c].to_numpy() for c in columns}
+    df_energies = pd.read_csv(energy_file, skiprows=1, names=columns, sep=r"\s+")
+    df_energies["kinetic_energy"] = df_energies["kinetic_energy"] * Ha_to_eV
+    df_energies["potential_energy"] = df_energies["potential_energy"] * Ha_to_eV
+    df_energies["conserved_quantity"] = df_energies["conserved_quantity"] * Ha_to_eV
+    df_energies = df_energies.astype(float)
+    return {c: df_energies[c].to_numpy() for c in columns}
 
 
 # TODO: The DOS file that CP2K outputs as of 2022.1 seems to have a lot of problems.

diff --git a/src/pymatgen/io/lammps/data.py b/src/pymatgen/io/lammps/data.py
@@ -512,8 +512,7 @@ def disassemble(
             unique_mids = np.unique(mids)
             data_by_mols = {}
             for k in unique_mids:
-                df = atoms_df[atoms_df["molecule-ID"] == k]
-                data_by_mols[k] = {"Atoms": df}
+                data_by_mols[k] = {"Atoms": atoms_df[atoms_df["molecule-ID"] == k]}
 
         masses = self.masses.copy()
         masses["label"] = atom_labels
@@ -665,51 +664,51 @@ def parse_section(sec_lines) -> tuple[str, pd.DataFrame]:
             kw = title_info[0].strip()
             str_io = StringIO("".join(sec_lines[2:]))  # skip the 2nd line
             if kw.endswith("Coeffs") and not kw.startswith("PairIJ"):
-                df_list = [
+                dfs = [
                     pd.read_csv(StringIO(line), header=None, comment="#", sep=r"\s+")
                     for line in sec_lines[2:]
                     if line.strip()
                 ]
-                df = pd.concat(df_list, ignore_index=True)
-                names = ["id"] + [f"coeff{i}" for i in range(1, df.shape[1])]
+                df_section = pd.concat(dfs, ignore_index=True)
+                names = ["id"] + [f"coeff{i}" for i in range(1, df_section.shape[1])]
             else:
-                df = pd.read_csv(str_io, header=None, comment="#", sep=r"\s+")
+                df_section = pd.read_csv(str_io, header=None, comment="#", sep=r"\s+")
                 if kw == "PairIJ Coeffs":
-                    names = ["id1", "id2"] + [f"coeff{i}" for i in range(1, df.shape[1] - 1)]
-                    df.index.name = None
+                    names = ["id1", "id2"] + [f"coeff{i}" for i in range(1, df_section.shape[1] - 1)]
+                    df_section.index.name = None
                 elif kw in SECTION_HEADERS:
                     names = ["id"] + SECTION_HEADERS[kw]
                 elif kw == "Atoms":
                     names = ["id"] + ATOMS_HEADERS[atom_style]
-                    if df.shape[1] == len(names):
+                    if df_section.shape[1] == len(names):
                         pass
-                    elif df.shape[1] == len(names) + 3:
+                    elif df_section.shape[1] == len(names) + 3:
                         names += ["nx", "ny", "nz"]
                     else:
                         raise ValueError(f"Format in Atoms section inconsistent with {atom_style=}")
                 else:
                     raise NotImplementedError(f"Parser for {kw} section not implemented")
-            df.columns = names
+            df_section.columns = names
             if sort_id:
                 sort_by = "id" if kw != "PairIJ Coeffs" else ["id1", "id2"]
-                df = df.sort_values(sort_by)
-            if "id" in df.columns:
-                df = df.set_index("id", drop=True)
-                df.index.name = None
-            return kw, df
+                df_section = df_section.sort_values(sort_by)
+            if "id" in df_section.columns:
+                df_section = df_section.set_index("id", drop=True)
+                df_section.index.name = None
+            return kw, df_section
 
         err_msg = "Bad LAMMPS data format where "
         body = {}
         seen_atoms = False
         for part in parts[1:]:
-            name, section = parse_section(part)
+            name, df_section = parse_section(part)
             if name == "Atoms":
                 seen_atoms = True
             if (
                 name in ["Velocities"] + SECTION_KEYWORDS["topology"] and not seen_atoms
             ):  # Atoms must appear earlier than these
                 raise RuntimeError(f"{err_msg}{name} section appears before Atoms section")
-            body[name] = section
+            body[name] = df_section
 
         err_msg += "Nos. of {} do not match between header and {} section"
         assert len(body["Masses"]) == header["types"]["atom"], err_msg.format("atom types", "Masses")
@@ -789,14 +788,14 @@ def from_ff_and_topologies(
 
         topology = {key: pd.DataFrame([]) for key, values in topo_labels.items() if len(values) > 0}
         for key in topology:
-            df = pd.DataFrame(np.concatenate(topo_collector[key]), columns=SECTION_HEADERS[key][1:])
-            df["type"] = list(map(ff.maps[key].get, topo_labels[key]))
-            if any(pd.isna(df["type"])):  # Throw away undefined topologies
+            df_topology = pd.DataFrame(np.concatenate(topo_collector[key]), columns=SECTION_HEADERS[key][1:])
+            df_topology["type"] = list(map(ff.maps[key].get, topo_labels[key]))
+            if any(pd.isna(df_topology["type"])):  # Throw away undefined topologies
                 warnings.warn(f"Undefined {key.lower()} detected and removed")
-                df = df.dropna(subset=["type"])
-                df = df.reset_index(drop=True)
-            df.index += 1
-            topology[key] = df[SECTION_HEADERS[key]]
+                df_topology = df_topology.dropna(subset=["type"])
+                df_topology = df_topology.reset_index(drop=True)
+            df_topology.index += 1
+            topology[key] = df_topology[SECTION_HEADERS[key]]
         topology = {key: values for key, values in topology.items() if not values.empty}
 
         items |= {"atoms": atoms, "velocities": velocities, "topology": topology}
@@ -1155,12 +1154,12 @@ def find_eq_types(label, section) -> list:
                 mapper[t] = i
 
         def process_data(data) -> pd.DataFrame:
-            df = pd.DataFrame(data)
-            assert self._is_valid(df), "Invalid coefficients with rows varying in length"
-            n, c = df.shape
-            df.columns = [f"coeff{i}" for i in range(1, c + 1)]
-            df.index = range(1, n + 1)
-            return df
+            df_coeffs = pd.DataFrame(data)
+            assert self._is_valid(df_coeffs), "Invalid coefficients with rows varying in length"
+            n, c = df_coeffs.shape
+            df_coeffs.columns = [f"coeff{i}" for i in range(1, c + 1)]
+            df_coeffs.index = range(1, n + 1)
+            return df_coeffs
 
         all_data = {kw: process_data(main_data)}
         if class2_data:
@@ -1383,15 +1382,15 @@ def parse_xyz(cls, filename: str | Path) -> pd.DataFrame:
             lines = file.readlines()
 
         str_io = StringIO("".join(lines[2:]))  # skip the 2nd line
-        df = pd.read_csv(
+        df_xyz = pd.read_csv(
             str_io,
             header=None,
             comment="#",
             sep=r"\s+",
             names=["atom", "x", "y", "z"],
         )
-        df.index += 1
-        return df
+        df_xyz.index += 1
+        return df_xyz
 
     @classmethod
     def from_files(cls, coordinate_file: str, list_of_numbers: list[int], *filenames: str) -> Self:

diff --git a/src/pymatgen/io/lammps/outputs.py b/src/pymatgen/io/lammps/outputs.py
@@ -174,14 +174,14 @@ def _parse_thermo(lines: list[str]) -> pd.DataFrame:
                 data["Step"] = int(step[1])
                 data |= {k: float(v) for k, v in re.findall(kv_pattern, "".join(ts[1:]))}
                 dicts.append(data)
-            df = pd.DataFrame(dicts)
+            df_thermo = pd.DataFrame(dicts)
             # rearrange the sequence of columns
             columns = ["Step"] + [k for k, v in re.findall(kv_pattern, "".join(time_steps[0][1:]))]
-            df = df[columns]
+            df_thermo = df_thermo[columns]
         # one line thermo data
         else:
-            df = pd.read_csv(StringIO("".join(lines)), sep=r"\s+")
-        return df
+            df_thermo = pd.read_csv(StringIO("".join(lines)), sep=r"\s+")
+        return df_thermo
 
     runs = []
     for b, e in zip(begins, ends, strict=True):