Skip to content

Commit

Permalink
Fix ruff PD901 and prefer sum over len+if (#4012)
Browse files Browse the repository at this point in the history
* simplify: prefer sum over len + if

astral-sh/ruff#13050

* fix ruff PD901 pandas-df-variable-name

* fix cast df_energies as float in cp2k parse_energy_file
  • Loading branch information
janosh authored Aug 22, 2024
1 parent 6ca78b3 commit 016f9de
Show file tree
Hide file tree
Showing 17 changed files with 96 additions and 107 deletions.
35 changes: 17 additions & 18 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -178,23 +178,22 @@ output-format = "concise"
select = ["ALL"]
ignore = [
# Rule families
"ANN", # flake8-annotations (not ready, require types for ALL args)
"ARG", # Check for unused function arguments
"BLE", # General catch of Exception
"C90", # Check for functions with a high McCabe complexity
"COM", # flake8-commas (conflict with line wrapper)
"CPY", # Missing copyright notice at top of file (need preview mode)
"EM", # Format nice error messages
"ERA", # Check for commented-out code
"FIX", # Check for FIXME, TODO and other developer notes
"FURB", # refurb (need preview mode, too many preview errors)
"G", # Validate logging format strings
"INP", # Ban PEP-420 implicit namespace packages
"N", # PEP8-naming (many var/arg names are intended)
"PTH", # Prefer pathlib over os.path
"SLF", # Access "private" class members
"T20", # Check for print/pprint
"TD", # TODO tags related
"ANN", # flake8-annotations (not ready, require types for ALL args)
"ARG", # Check for unused function arguments
"BLE", # General catch of Exception
"C90", # Check for functions with a high McCabe complexity
"COM", # flake8-commas (conflict with line wrapper)
"CPY", # Missing copyright notice at top of file (need preview mode)
"EM", # Format nice error messages
"ERA", # Check for commented-out code
"FIX", # Check for FIXME, TODO and other developer notes
"G", # Validate logging format strings
"INP", # Ban PEP-420 implicit namespace packages
"N", # PEP8-naming (many var/arg names are intended)
"PTH", # Prefer pathlib over os.path
"SLF", # Access "private" class members
"T20", # Check for print/pprint
"TD", # TODO tags related

# Single rules
"B023", # Function definition does not bind loop variable
Expand All @@ -207,13 +206,13 @@ ignore = [
"FBT001", # Boolean-typed positional argument in function definition
"FBT002", # Boolean default positional argument in function
"NPY201", # TODO: enable after migration to NumPy 2.0
"PD901", # pandas-df-variable-name
"PERF203", # Use of try-except in for/while loop
"PERF401", # Replace "for" loops with list comprehension
"PLR0911", # Too many return statements
"PLR0912", # Too many branches
"PLR0913", # Too many arguments
"PLR0915", # Too many statements
"PLR1702", # Too many nested blocks
"PLR2004", # Magic-value-comparison TODO fix these
"PLW2901", # Outer for loop variable overwritten by inner assignment target
"PT013", # Incorrect import of pytest
Expand Down
5 changes: 1 addition & 4 deletions src/pymatgen/analysis/chemenv/utils/scripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,7 @@ def draw_cg(
faces = cg.faces(neighbors)
edges = cg.edges(neighbors)
symbol = next(iter(site.species)).symbol
if faces_color_override:
color = faces_color_override
else:
color = [float(i) / 255 for i in vis.el_color_mapping[symbol]]
color = faces_color_override or [float(i) / 255 for i in vis.el_color_mapping[symbol]]
vis.add_faces(faces, color, opacity=0.4)
vis.add_edges(edges)
if show_perfect:
Expand Down
2 changes: 1 addition & 1 deletion src/pymatgen/analysis/functional_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def get_special_carbon(self, elements=None):

neighbor_spec = [str(self.species[n]) for n in neighbors]

ons = len([n for n in neighbor_spec if n in ["O", "N", "S"]])
ons = sum(n in ["O", "N", "S"] for n in neighbor_spec)

if len(neighbors) == 4 and ons >= 2:
specials.add(node)
Expand Down
25 changes: 10 additions & 15 deletions src/pymatgen/analysis/magnetism/heisenberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,32 +770,27 @@ def _do_screen(structures, energies):
screened_structures (list): Sorted structures.
screened_energies (list): Sorted energies.
"""
magmoms = [s.site_properties["magmom"] for s in structures]
n_below_1ub = [len([m for m in ms if abs(m) < 1]) for ms in magmoms]

df = pd.DataFrame(
{
"structure": structures,
"energy": energies,
"magmoms": magmoms,
"n_below_1ub": n_below_1ub,
}
magmoms = [struct.site_properties["magmom"] for struct in structures]
n_below_1ub = [sum(abs(m) < 1 for m in ms) for ms in magmoms]

df_mag = pd.DataFrame(
{"structure": structures, "energy": energies, "magmoms": magmoms, "n_below_1ub": n_below_1ub}
)

# keep the ground and first excited state fixed to capture the
# low-energy spectrum
index = list(df.index)[2:]
df_high_energy = df.iloc[2:]
index = list(df_mag.index)[2:]
df_high_energy = df_mag.iloc[2:]

# Prioritize structures with fewer magmoms < 1 uB
df_high_energy = df_high_energy.sort_values(by="n_below_1ub")

index = [0, 1, *df_high_energy.index]

# sort
df = df.reindex(index)
screened_structures = list(df["structure"].values)
screened_energies = list(df["energy"].values)
df_mag = df_mag.reindex(index)
screened_structures = list(df_mag["structure"].values)
screened_energies = list(df_mag["energy"].values)

return screened_structures, screened_energies

Expand Down
6 changes: 3 additions & 3 deletions src/pymatgen/analysis/phase_diagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -3036,7 +3036,7 @@ def get_marker_props(coords, entries):
for el, axis in zip(self._pd.elements, range(self._dim), strict=True):
_cartesian_positions = [x, y, z]
_cartesian_positions[axis].append(entry.composition[el])
label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
elif self._dim == 3 and self.ternary_style == "3d":
x.append(coord[0])
y.append(coord[1])
Expand All @@ -3047,7 +3047,7 @@ def get_marker_props(coords, entries):
entry.composition[el] for el, _axis in zip(self._pd.elements, range(self._dim), strict=True)
)
for el, _axis in zip(self._pd.elements, range(self._dim), strict=True):
label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
elif self._dim == 4:
x.append(coord[0])
y.append(coord[1])
Expand All @@ -3058,7 +3058,7 @@ def get_marker_props(coords, entries):
entry.composition[el] for el, _axis in zip(self._pd.elements, range(self._dim), strict=True)
)
for el, _axis in zip(self._pd.elements, range(self._dim), strict=True):
label += f"<br> {el}: {round(entry.composition[el]/total_sum_el, 6)}"
label += f"<br> {el}: {round(entry.composition[el] / total_sum_el, 6)}"
else:
x.append(coord[0])
y.append(coord[1])
Expand Down
7 changes: 3 additions & 4 deletions src/pymatgen/command_line/vampire_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,13 +381,12 @@ def parse_stdout(vamp_stdout, n_mats: int) -> tuple:
names = ["T", "m_total", *[f"m_{idx + 1}" for idx in range(n_mats)], "X_x", "X_y", "X_z", "X_m", "nan"]

# Parsing vampire MC output
df = pd.read_csv(vamp_stdout, sep="\t", skiprows=9, header=None, names=names)
df = df.drop("nan", axis=1)
df_stdout = pd.read_csv(vamp_stdout, sep="\t", skiprows=9, header=None, names=names).drop("nan", axis=1)

parsed_out = df.to_json()
parsed_out = df_stdout.to_json()

# Max of susceptibility <-> critical temp
critical_temp = df.iloc[df.X_m.idxmax()]["T"]
critical_temp = df_stdout.iloc[df_stdout.X_m.idxmax()]["T"]

return parsed_out, critical_temp

Expand Down
2 changes: 1 addition & 1 deletion src/pymatgen/core/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,7 +1744,7 @@ def get_integer_index(
mi += 0 # converts -0 to 0

def n_minus(index):
return len([h for h in index if h < 0])
return sum(h < 0 for h in index)

if n_minus(mi) > n_minus(mi * -1):
mi *= -1
Expand Down
8 changes: 4 additions & 4 deletions src/pymatgen/core/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2806,10 +2806,10 @@ def as_dataframe(self) -> pd.DataFrame:
row.append(site.properties.get(key))
data.append(row)

df = pd.DataFrame(data, columns=["Species", *"abcxyz", *prop_keys])
df.attrs["Reduced Formula"] = self.composition.reduced_formula
df.attrs["Lattice"] = self.lattice
return df
df_struct = pd.DataFrame(data, columns=["Species", *"abcxyz", *prop_keys])
df_struct.attrs["Reduced Formula"] = self.composition.reduced_formula
df_struct.attrs["Lattice"] = self.lattice
return df_struct

@classmethod
def from_dict(
Expand Down
2 changes: 1 addition & 1 deletion src/pymatgen/core/surface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,7 +2114,7 @@ def math_lcm(a: int, b: int) -> int:
transf_hkl = np.array([idx // divisor for idx in transf_hkl])

# Get positive Miller index
if len([i for i in transf_hkl if i < 0]) > 1:
if sum(idx < 0 for idx in transf_hkl) > 1:
transf_hkl *= -1

return tuple(transf_hkl)
Expand Down
2 changes: 1 addition & 1 deletion src/pymatgen/electronic_structure/cohp.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def has_antibnd_states_below_efermi(
limit (float): Only COHP higher than this value will be considered.
"""
populations = self.cohp
n_energies_below_efermi = len([energy for energy in self.energies if energy <= self.efermi])
n_energies_below_efermi = sum(energy <= self.efermi for energy in self.energies)

if populations is None:
return None
Expand Down
4 changes: 2 additions & 2 deletions src/pymatgen/entries/mixing_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,8 @@ def process_entries(
processed_entry_list.append(entry)

if verbose:
count_type_1 = len([e for e in processed_entry_list if e.parameters["run_type"] in self.valid_rtypes_1])
count_type_2 = len([e for e in processed_entry_list if e.parameters["run_type"] in self.valid_rtypes_2])
count_type_1 = sum(entry.parameters["run_type"] in self.valid_rtypes_1 for entry in processed_entry_list)
count_type_2 = sum(entry.parameters["run_type"] in self.valid_rtypes_2 for entry in processed_entry_list)
print(
f"\nProcessing complete. Mixed entries contain {count_type_1} {self.run_type_1} and {count_type_2} "
f"{self.run_type_2} entries.\n"
Expand Down
12 changes: 6 additions & 6 deletions src/pymatgen/io/cp2k/outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1636,12 +1636,12 @@ def parse_energy_file(energy_file):
"conserved_quantity",
"used_time",
]
df = pd.read_csv(energy_file, skiprows=1, names=columns, sep=r"\s+")
df["kinetic_energy"] = df["kinetic_energy"] * Ha_to_eV
df["potential_energy"] = df["potential_energy"] * Ha_to_eV
df["conserved_quantity"] = df["conserved_quantity"] * Ha_to_eV
df.astype(float)
return {c: df[c].to_numpy() for c in columns}
df_energies = pd.read_csv(energy_file, skiprows=1, names=columns, sep=r"\s+")
df_energies["kinetic_energy"] = df_energies["kinetic_energy"] * Ha_to_eV
df_energies["potential_energy"] = df_energies["potential_energy"] * Ha_to_eV
df_energies["conserved_quantity"] = df_energies["conserved_quantity"] * Ha_to_eV
df_energies = df_energies.astype(float)
return {c: df_energies[c].to_numpy() for c in columns}


# TODO: The DOS file that CP2K outputs as of 2022.1 seems to have a lot of problems.
Expand Down
67 changes: 33 additions & 34 deletions src/pymatgen/io/lammps/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,7 @@ def disassemble(
unique_mids = np.unique(mids)
data_by_mols = {}
for k in unique_mids:
df = atoms_df[atoms_df["molecule-ID"] == k]
data_by_mols[k] = {"Atoms": df}
data_by_mols[k] = {"Atoms": atoms_df[atoms_df["molecule-ID"] == k]}

masses = self.masses.copy()
masses["label"] = atom_labels
Expand Down Expand Up @@ -665,51 +664,51 @@ def parse_section(sec_lines) -> tuple[str, pd.DataFrame]:
kw = title_info[0].strip()
str_io = StringIO("".join(sec_lines[2:])) # skip the 2nd line
if kw.endswith("Coeffs") and not kw.startswith("PairIJ"):
df_list = [
dfs = [
pd.read_csv(StringIO(line), header=None, comment="#", sep=r"\s+")
for line in sec_lines[2:]
if line.strip()
]
df = pd.concat(df_list, ignore_index=True)
names = ["id"] + [f"coeff{i}" for i in range(1, df.shape[1])]
df_section = pd.concat(dfs, ignore_index=True)
names = ["id"] + [f"coeff{i}" for i in range(1, df_section.shape[1])]
else:
df = pd.read_csv(str_io, header=None, comment="#", sep=r"\s+")
df_section = pd.read_csv(str_io, header=None, comment="#", sep=r"\s+")
if kw == "PairIJ Coeffs":
names = ["id1", "id2"] + [f"coeff{i}" for i in range(1, df.shape[1] - 1)]
df.index.name = None
names = ["id1", "id2"] + [f"coeff{i}" for i in range(1, df_section.shape[1] - 1)]
df_section.index.name = None
elif kw in SECTION_HEADERS:
names = ["id"] + SECTION_HEADERS[kw]
elif kw == "Atoms":
names = ["id"] + ATOMS_HEADERS[atom_style]
if df.shape[1] == len(names):
if df_section.shape[1] == len(names):
pass
elif df.shape[1] == len(names) + 3:
elif df_section.shape[1] == len(names) + 3:
names += ["nx", "ny", "nz"]
else:
raise ValueError(f"Format in Atoms section inconsistent with {atom_style=}")
else:
raise NotImplementedError(f"Parser for {kw} section not implemented")
df.columns = names
df_section.columns = names
if sort_id:
sort_by = "id" if kw != "PairIJ Coeffs" else ["id1", "id2"]
df = df.sort_values(sort_by)
if "id" in df.columns:
df = df.set_index("id", drop=True)
df.index.name = None
return kw, df
df_section = df_section.sort_values(sort_by)
if "id" in df_section.columns:
df_section = df_section.set_index("id", drop=True)
df_section.index.name = None
return kw, df_section

err_msg = "Bad LAMMPS data format where "
body = {}
seen_atoms = False
for part in parts[1:]:
name, section = parse_section(part)
name, df_section = parse_section(part)
if name == "Atoms":
seen_atoms = True
if (
name in ["Velocities"] + SECTION_KEYWORDS["topology"] and not seen_atoms
): # Atoms must appear earlier than these
raise RuntimeError(f"{err_msg}{name} section appears before Atoms section")
body[name] = section
body[name] = df_section

err_msg += "Nos. of {} do not match between header and {} section"
assert len(body["Masses"]) == header["types"]["atom"], err_msg.format("atom types", "Masses")
Expand Down Expand Up @@ -789,14 +788,14 @@ def from_ff_and_topologies(

topology = {key: pd.DataFrame([]) for key, values in topo_labels.items() if len(values) > 0}
for key in topology:
df = pd.DataFrame(np.concatenate(topo_collector[key]), columns=SECTION_HEADERS[key][1:])
df["type"] = list(map(ff.maps[key].get, topo_labels[key]))
if any(pd.isna(df["type"])): # Throw away undefined topologies
df_topology = pd.DataFrame(np.concatenate(topo_collector[key]), columns=SECTION_HEADERS[key][1:])
df_topology["type"] = list(map(ff.maps[key].get, topo_labels[key]))
if any(pd.isna(df_topology["type"])): # Throw away undefined topologies
warnings.warn(f"Undefined {key.lower()} detected and removed")
df = df.dropna(subset=["type"])
df = df.reset_index(drop=True)
df.index += 1
topology[key] = df[SECTION_HEADERS[key]]
df_topology = df_topology.dropna(subset=["type"])
df_topology = df_topology.reset_index(drop=True)
df_topology.index += 1
topology[key] = df_topology[SECTION_HEADERS[key]]
topology = {key: values for key, values in topology.items() if not values.empty}

items |= {"atoms": atoms, "velocities": velocities, "topology": topology}
Expand Down Expand Up @@ -1155,12 +1154,12 @@ def find_eq_types(label, section) -> list:
mapper[t] = i

def process_data(data) -> pd.DataFrame:
df = pd.DataFrame(data)
assert self._is_valid(df), "Invalid coefficients with rows varying in length"
n, c = df.shape
df.columns = [f"coeff{i}" for i in range(1, c + 1)]
df.index = range(1, n + 1)
return df
df_coeffs = pd.DataFrame(data)
assert self._is_valid(df_coeffs), "Invalid coefficients with rows varying in length"
n, c = df_coeffs.shape
df_coeffs.columns = [f"coeff{i}" for i in range(1, c + 1)]
df_coeffs.index = range(1, n + 1)
return df_coeffs

all_data = {kw: process_data(main_data)}
if class2_data:
Expand Down Expand Up @@ -1383,15 +1382,15 @@ def parse_xyz(cls, filename: str | Path) -> pd.DataFrame:
lines = file.readlines()

str_io = StringIO("".join(lines[2:])) # skip the 2nd line
df = pd.read_csv(
df_xyz = pd.read_csv(
str_io,
header=None,
comment="#",
sep=r"\s+",
names=["atom", "x", "y", "z"],
)
df.index += 1
return df
df_xyz.index += 1
return df_xyz

@classmethod
def from_files(cls, coordinate_file: str, list_of_numbers: list[int], *filenames: str) -> Self:
Expand Down
8 changes: 4 additions & 4 deletions src/pymatgen/io/lammps/outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,14 @@ def _parse_thermo(lines: list[str]) -> pd.DataFrame:
data["Step"] = int(step[1])
data |= {k: float(v) for k, v in re.findall(kv_pattern, "".join(ts[1:]))}
dicts.append(data)
df = pd.DataFrame(dicts)
df_thermo = pd.DataFrame(dicts)
# rearrange the sequence of columns
columns = ["Step"] + [k for k, v in re.findall(kv_pattern, "".join(time_steps[0][1:]))]
df = df[columns]
df_thermo = df_thermo[columns]
# one line thermo data
else:
df = pd.read_csv(StringIO("".join(lines)), sep=r"\s+")
return df
df_thermo = pd.read_csv(StringIO("".join(lines)), sep=r"\s+")
return df_thermo

runs = []
for b, e in zip(begins, ends, strict=True):
Expand Down
Loading

0 comments on commit 016f9de

Please sign in to comment.