Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build backend: Switch to custom glob-walkdir implementation #9013

Merged
merged 8 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ uv-distribution-types = { path = "crates/uv-distribution-types" }
uv-extract = { path = "crates/uv-extract" }
uv-fs = { path = "crates/uv-fs" }
uv-git = { path = "crates/uv-git" }
uv-globfilter = { path = "crates/uv-globfilter" }
uv-install-wheel = { path = "crates/uv-install-wheel", default-features = false }
uv-installer = { path = "crates/uv-installer" }
uv-macros = { path = "crates/uv-macros" }
Expand Down Expand Up @@ -133,6 +134,7 @@ quote = { version = "1.0.37" }
rayon = { version = "1.10.0" }
reflink-copy = { version = "0.1.19" }
regex = { version = "1.10.6" }
regex-automata = { version = "0.4.8", default-features = false, features = ["dfa-build", "dfa-search", "perf", "std", "syntax"] }
reqwest = { version = "0.12.7", default-features = false, features = ["json", "gzip", "stream", "rustls-tls", "rustls-tls-native-roots", "socks", "multipart", "http2"] }
reqwest-middleware = { version = "0.4.0", features = ["multipart"] }
reqwest-retry = { version = "0.7.0" }
Expand Down
2 changes: 1 addition & 1 deletion crates/uv-build-backend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ doctest = false
[dependencies]
uv-distribution-filename = { workspace = true }
uv-fs = { workspace = true }
uv-globfilter = { workspace = true }
uv-normalize = { workspace = true }
uv-pep440 = { workspace = true }
uv-pep508 = { workspace = true }
Expand All @@ -24,7 +25,6 @@ uv-warnings = { workspace = true }
csv = { workspace = true }
flate2 = { workspace = true }
fs-err = { workspace = true }
glob = { workspace = true }
globset = { workspace = true }
itertools = { workspace = true }
serde = { workspace = true }
Expand Down
152 changes: 119 additions & 33 deletions crates/uv-build-backend/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
mod metadata;
mod pep639_glob;

use crate::metadata::{PyProjectToml, ValidationError};
use crate::pep639_glob::Pep639GlobError;
use flate2::write::GzEncoder;
use flate2::Compression;
use fs_err::File;
use glob::{GlobError, PatternError};
use globset::{Glob, GlobSetBuilder};
use globset::GlobSetBuilder;
use itertools::Itertools;
use sha2::{Digest, Sha256};
use std::fs::FileType;
Expand All @@ -19,6 +16,7 @@ use thiserror::Error;
use tracing::{debug, trace};
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename};
use uv_fs::Simplified;
use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError};
use walkdir::WalkDir;
use zip::{CompressionMethod, ZipWriter};

Expand All @@ -30,16 +28,26 @@ pub enum Error {
Toml(#[from] toml::de::Error),
#[error("Invalid pyproject.toml")]
Validation(#[from] ValidationError),
#[error("Invalid `project.license-files` glob expression: `{0}`")]
Pep639Glob(String, #[source] Pep639GlobError),
#[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")]
Pattern(String, #[source] PatternError),
/// [`GlobError`] is a wrapped io error.
#[error(transparent)]
Glob(#[from] GlobError),
#[error("Unsupported glob expression in: `{field}`")]
PortableGlob {
field: String,
#[source]
source: PortableGlobError,
},
/// <https://github.com/BurntSushi/ripgrep/discussions/2927>
#[error("Glob expressions caused to large regex in: `{field}`")]
GlobSetTooLarge {
field: String,
#[source]
source: globset::Error,
},
/// [`globset::Error`] shows the glob that failed to parse.
#[error(transparent)]
GlobSet(#[from] globset::Error),
#[error("Unsupported glob expression in: `{field}`")]
GlobSet {
field: String,
#[source]
err: globset::Error,
},
#[error("Failed to walk source tree: `{}`", root.user_display())]
WalkDir {
root: PathBuf,
Expand Down Expand Up @@ -322,7 +330,10 @@ pub fn build_wheel(
err,
})?;

let relative_path = entry.path().strip_prefix(&strip_root)?;
let relative_path = entry
.path()
.strip_prefix(&strip_root)
.expect("walkdir starts with root");
let relative_path_str = relative_path
.to_str()
.ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?;
Expand Down Expand Up @@ -354,10 +365,52 @@ pub fn build_wheel(
Ok(filename)
}

/// Include and exclude glob lists that select the files of a source distribution.
///
/// TODO(konsti): Wire this up with actual settings and remove this struct.
///
/// To select which files to include in the source distribution, we first add the includes, then
/// remove the excludes from that.
pub struct SourceDistSettings {
/// Glob expressions selecting which files and directories to include in the source
/// distribution.
///
/// Includes are anchored, which means that `pyproject.toml` includes only
/// `<project root>/pyproject.toml`. Use for example `assets/**/sample.csv` to include all
/// `sample.csv` files in `<project root>/assets` or any child directory. To recursively include
/// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and
/// reproducibility, avoid unanchored matches such as `**/sample.csv`.
///
/// The glob syntax is the reduced portable glob from
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
include: Vec<String>,
/// Glob expressions selecting which files and directories to exclude from the previous source
/// distribution includes.
///
/// Excludes are not anchored, which means that `__pycache__` excludes all directories named
/// `__pycache__` and its children anywhere. To anchor a directory, use a `/` prefix, e.g.,
/// `/dist` will exclude only `<project root>/dist`.
///
/// The glob syntax is the reduced portable glob from
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
exclude: Vec<String>,
}

impl Default for SourceDistSettings {
    /// Default selection: everything under `src/` plus `pyproject.toml`, minus Python bytecode
    /// caches (`__pycache__`, `*.pyc`, `*.pyo`).
    fn default() -> Self {
        let default_includes = ["src/**", "pyproject.toml"];
        let default_excludes = ["__pycache__", "*.pyc", "*.pyo"];

        Self {
            include: Vec::from(default_includes.map(String::from)),
            exclude: Vec::from(default_excludes.map(String::from)),
        }
    }
}

/// Build a source distribution from the source tree and place it in the output directory.
pub fn build_source_dist(
source_tree: &Path,
source_dist_directory: &Path,
settings: SourceDistSettings,
uv_version: &str,
) -> Result<SourceDistFilename, Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
Expand Down Expand Up @@ -392,42 +445,75 @@ pub fn build_source_dist(
)
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;

let includes = ["src/**/*", "pyproject.toml"];
let mut include_builder = GlobSetBuilder::new();
for include in includes {
include_builder.add(Glob::new(include)?);
let mut include_globs = Vec::new();
for include in settings.include {
let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;
include_globs.push(glob.clone());
}
let include_matcher = include_builder.build()?;
let include_matcher =
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;

let excludes = ["__pycache__", "*.pyc", "*.pyo"];
let mut exclude_builder = GlobSetBuilder::new();
for exclude in excludes {
exclude_builder.add(Glob::new(exclude)?);
for exclude in settings.exclude {
// Excludes are unanchored
let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
exclude.to_string()
} else {
format!("**/{exclude}").to_string()
};
let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;
exclude_builder.add(glob);
}
let exclude_matcher = exclude_builder.build()?;
let exclude_matcher = exclude_builder
.build()
.map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;

// TODO(konsti): Add files linked by pyproject.toml

for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| {
let relative = dir
for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| {
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
// TODO(konsti): Also check that we're matching at least a prefix of an include matcher.
!exclude_matcher.is_match(relative)
.expect("walkdir starts with root")
.to_path_buf();

// Fast path: Don't descend into a directory that can't be included. This is the most
// important performance optimization: it avoids descending into directories such as
// `.venv`. While walkdir is generally cheap, we still avoid traversing large data
// directories that often exist at the top level of a project. This is especially noticeable
// on network file systems with high latency per operation (even when contiguous reads are
// still fast).
include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
}) {
let entry = file.map_err(|err| Error::WalkDir {
let entry = entry.map_err(|err| Error::WalkDir {
root: source_tree.to_path_buf(),
err,
})?;
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
if !include_matcher.is_match(relative) {
.expect("walkdir starts with root")
.to_path_buf();

if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
trace!("Excluding {}", relative.user_display());
continue;
}
};

debug!("Including {}", relative.user_display());

let metadata = fs_err::metadata(entry.path())?;
Expand Down Expand Up @@ -462,7 +548,7 @@ pub fn build_source_dist(
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
} else {
return Err(Error::UnsupportedFileType(
relative.to_path_buf(),
relative.clone(),
entry.file_type(),
));
}
Expand Down
56 changes: 42 additions & 14 deletions crates/uv-build-backend/src/metadata.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
use crate::pep639_glob::parse_pep639_glob;
use crate::Error;
use globset::{Glob, GlobSetBuilder};
use itertools::Itertools;
use serde::Deserialize;
use std::collections::{BTreeMap, Bound};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use tracing::debug;
use tracing::{debug, trace};
use uv_fs::Simplified;
use uv_globfilter::parse_portable_glob;
use uv_normalize::{ExtraName, PackageName};
use uv_pep440::{Version, VersionSpecifiers};
use uv_pep508::{Requirement, VersionOrUrl};
use uv_pypi_types::{Metadata23, VerbatimParsedUrl};
use uv_warnings::warn_user_once;
use version_ranges::Ranges;
use walkdir::WalkDir;

#[derive(Debug, Error)]
pub enum ValidationError {
Expand Down Expand Up @@ -312,27 +314,53 @@ impl PyProjectToml {
};

let mut license_files = Vec::new();
let mut license_glob_builder = GlobSetBuilder::new();
for license_glob in license_globs {
let pep639_glob = parse_pep639_glob(license_glob)
.map_err(|err| Error::Pep639Glob(license_glob.to_string(), err))?;
let absolute_glob = PathBuf::from(glob::Pattern::escape(
let pep639_glob =
parse_portable_glob(license_glob).map_err(|err| Error::PortableGlob {
field: license_glob.to_string(),
source: err,
})?;
let absolute_glob = PathBuf::from(globset::escape(
root.simplified().to_string_lossy().as_ref(),
))
.join(pep639_glob.to_string())
.to_string_lossy()
.to_string();
for license_file in glob::glob(&absolute_glob)
.map_err(|err| Error::Pattern(absolute_glob.to_string(), err))?
{
let license_file = license_file
.map_err(Error::Glob)?
.to_string_lossy()
.to_string();
if !license_files.contains(&license_file) {
license_files.push(license_file);
license_glob_builder.add(Glob::new(&absolute_glob).map_err(|err| {
Error::GlobSet {
field: "project.license-files".to_string(),
err,
}
})?);
}
let license_globs = license_glob_builder.build().map_err(|err| Error::GlobSet {
field: "project.license-files".to_string(),
err,
})?;

for entry in WalkDir::new(".") {
let entry = entry.map_err(|err| Error::WalkDir {
root: PathBuf::from("."),
err,
})?;
let relative = entry
.path()
.strip_prefix("./")
.expect("walkdir starts with root");
if !license_globs.is_match(relative) {
trace!("Not a license files match: `{}`", relative.user_display());
continue;
}

debug!("License files match: `{}`", relative.user_display());
let license_file = relative.to_string_lossy().to_string();

if !license_files.contains(&license_file) {
license_files.push(license_file);
}
}

// The glob order may be unstable
license_files.sort();

Expand Down
Loading
Loading