Skip to content

Commit

Permalink
Custom globbing impl
Browse files Browse the repository at this point in the history
  • Loading branch information
konstin committed Nov 13, 2024
1 parent 489d42a commit ddde029
Show file tree
Hide file tree
Showing 17 changed files with 786 additions and 191 deletions.
19 changes: 17 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ uv-distribution-types = { path = "crates/uv-distribution-types" }
uv-extract = { path = "crates/uv-extract" }
uv-fs = { path = "crates/uv-fs" }
uv-git = { path = "crates/uv-git" }
uv-globfilter = { path = "crates/uv-globfilter" }
uv-install-wheel = { path = "crates/uv-install-wheel", default-features = false }
uv-installer = { path = "crates/uv-installer" }
uv-macros = { path = "crates/uv-macros" }
Expand Down Expand Up @@ -80,6 +81,7 @@ backoff = { version = "0.4.0" }
base64 = { version = "0.22.1" }
bitflags = { version = "2.6.0" }
boxcar = { version = "0.2.5" }
bstr = { version = "1.10.0", default-features = false, features = ["std"] }
bytecheck = { version = "0.8.0" }
cachedir = { version = "0.3.1" }
cargo-util = { version = "0.2.14" }
Expand Down Expand Up @@ -125,14 +127,14 @@ path-slash = { version = "0.2.1" }
pathdiff = { version = "0.2.1" }
petgraph = { version = "0.6.5" }
platform-info = { version = "2.0.3" }
procfs = { version = "0.17.0", default-features = false, features = ["flate2"] }
proc-macro2 = { version = "1.0.86" }
procfs = { version = "0.17.0", default-features = false, features = ["flate2"] }
pubgrub = { git = "https://github.com/astral-sh/pubgrub", rev = "95e1390399cdddee986b658be19587eb1fdb2d79" }
version-ranges = { git = "https://github.com/astral-sh/pubgrub", rev = "95e1390399cdddee986b658be19587eb1fdb2d79" }
quote = { version = "1.0.37" }
rayon = { version = "1.10.0" }
reflink-copy = { version = "0.1.19" }
regex = { version = "1.10.6" }
regex-automata = { version = "0.4.8", default-features = false, features = ["dfa-build", "dfa-search", "perf", "std", "syntax"] }
reqwest = { version = "0.12.7", default-features = false, features = ["json", "gzip", "stream", "rustls-tls", "rustls-tls-native-roots", "socks", "multipart", "http2"] }
reqwest-middleware = { git = "https://github.com/TrueLayer/reqwest-middleware", rev = "d95ec5a99fcc9a4339e1850d40378bbfe55ab121", features = ["multipart"] }
reqwest-retry = { git = "https://github.com/TrueLayer/reqwest-middleware", rev = "d95ec5a99fcc9a4339e1850d40378bbfe55ab121" }
Expand Down Expand Up @@ -172,6 +174,7 @@ unicode-width = { version = "0.1.13" }
unscanny = { version = "0.1.0" }
url = { version = "2.5.2" }
urlencoding = { version = "2.1.3" }
version-ranges = { git = "https://github.com/astral-sh/pubgrub", rev = "95e1390399cdddee986b658be19587eb1fdb2d79" }
walkdir = { version = "2.5.0" }
which = { version = "7.0.0", features = ["regex"] }
windows-registry = { version = "0.3.0" }
Expand Down
2 changes: 1 addition & 1 deletion crates/uv-build-backend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ doctest = false
[dependencies]
uv-distribution-filename = { workspace = true }
uv-fs = { workspace = true }
uv-globfilter = { workspace = true }
uv-normalize = { workspace = true }
uv-pep440 = { workspace = true }
uv-pep508 = { workspace = true }
Expand All @@ -24,7 +25,6 @@ uv-warnings = { workspace = true }
csv = { workspace = true }
flate2 = { workspace = true }
fs-err = { workspace = true }
glob = { workspace = true }
globset = { workspace = true }
itertools = { workspace = true }
serde = { workspace = true }
Expand Down
150 changes: 117 additions & 33 deletions crates/uv-build-backend/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
mod metadata;
mod pep639_glob;

use crate::metadata::{PyProjectToml, ValidationError};
use crate::pep639_glob::Pep639GlobError;
use flate2::write::GzEncoder;
use flate2::Compression;
use fs_err::File;
use glob::{GlobError, PatternError};
use globset::{Glob, GlobSetBuilder};
use globset::GlobSetBuilder;
use itertools::Itertools;
use sha2::{Digest, Sha256};
use std::fs::FileType;
Expand All @@ -19,6 +16,7 @@ use thiserror::Error;
use tracing::{debug, trace};
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename};
use uv_fs::Simplified;
use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError};
use walkdir::WalkDir;
use zip::{CompressionMethod, ZipWriter};

Expand All @@ -30,16 +28,26 @@ pub enum Error {
Toml(#[from] toml::de::Error),
#[error("Invalid pyproject.toml")]
Validation(#[from] ValidationError),
#[error("Invalid `project.license-files` glob expression: `{0}`")]
Pep639Glob(String, #[source] Pep639GlobError),
#[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")]
Pattern(String, #[source] PatternError),
/// [`GlobError`] is a wrapped io error.
#[error(transparent)]
Glob(#[from] GlobError),
#[error("Unsupported glob expression in: `{field}`")]
PortableGlob {
field: String,
#[source]
source: PortableGlobError,
},
/// <https://github.com/BurntSushi/ripgrep/discussions/2927>
#[error("Glob expressions caused to large regex in: `{field}`")]
GlobSetTooLarge {
field: String,
#[source]
source: globset::Error,
},
/// [`globset::Error`] shows the glob that failed to parse.
#[error(transparent)]
GlobSet(#[from] globset::Error),
#[error("Unsupported glob expression in: `{field}`")]
GlobSet {
field: String,
#[source]
err: globset::Error,
},
#[error("Failed to walk source tree: `{}`", root.user_display())]
WalkDir {
root: PathBuf,
Expand Down Expand Up @@ -322,7 +330,10 @@ pub fn build_wheel(
err,
})?;

let relative_path = entry.path().strip_prefix(&strip_root)?;
let relative_path = entry
.path()
.strip_prefix(&strip_root)
.expect("walkdir starts with root");
let relative_path_str = relative_path
.to_str()
.ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?;
Expand Down Expand Up @@ -354,10 +365,52 @@ pub fn build_wheel(
Ok(filename)
}

/// TODO(konsti): Wire this up with actual settings and remove this struct.
///
/// To select which files to include in the source distribution, we first add the includes, then
/// remove the excludes from that.
#[derive(Debug, Clone)]
pub struct SourceDistSettings {
    /// Glob expressions selecting which files and directories to include in the source
    /// distribution.
    ///
    /// Includes are anchored, which means that `pyproject.toml` includes only
    /// `<project root>/pyproject.toml`. Use for example `assets/**/sample.csv` to include all
    /// `sample.csv` files in `<project root>/assets` or any child directory. To recursively include
    /// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and
    /// reproducibility, avoid unanchored matches such as `**/sample.csv`.
    ///
    /// The glob syntax is the reduced portable glob from
    /// [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key).
    include: Vec<String>,
    /// Glob expressions selecting which files and directories to remove again from the
    /// previously computed source distribution includes.
    ///
    /// Excludes are not anchored, which means that `__pycache__` excludes all directories named
    /// `__pycache__` and their children anywhere in the tree. To anchor an exclude, use a `/`
    /// prefix, e.g., `/dist` will exclude only `<project root>/dist`.
    ///
    /// The glob syntax is the reduced portable glob from
    /// [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key).
    exclude: Vec<String>,
}

impl Default for SourceDistSettings {
    /// Builds the stock selection: include everything matched by `src/**` plus the
    /// `pyproject.toml` entry, and exclude `__pycache__`, `*.pyc` and `*.pyo`.
    fn default() -> Self {
        // Turn a list of pattern literals into the owned strings the fields store.
        let owned = |patterns: &[&str]| -> Vec<String> {
            patterns.iter().map(|pattern| (*pattern).to_owned()).collect()
        };

        let include = owned(&["src/**", "pyproject.toml"]);
        let exclude = owned(&["__pycache__", "*.pyc", "*.pyo"]);
        Self { include, exclude }
    }
}

/// Build a source distribution from the source tree and place it in the output directory.
pub fn build_source_dist(
source_tree: &Path,
source_dist_directory: &Path,
settings: SourceDistSettings,
uv_version: &str,
) -> Result<SourceDistFilename, Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
Expand Down Expand Up @@ -392,42 +445,73 @@ pub fn build_source_dist(
)
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;

let includes = ["src/**/*", "pyproject.toml"];
let mut include_builder = GlobSetBuilder::new();
for include in includes {
include_builder.add(Glob::new(include)?);
let mut include_globs = Vec::new();
for include in settings.include {
let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;
include_globs.push(glob.clone());
}
let include_matcher = include_builder.build()?;
let include_matcher =
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;

let excludes = ["__pycache__", "*.pyc", "*.pyo"];
let mut exclude_builder = GlobSetBuilder::new();
for exclude in excludes {
exclude_builder.add(Glob::new(exclude)?);
for exclude in settings.exclude {
let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
exclude.to_string()
} else {
format!("**/{exclude}").to_string()
};
let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;
exclude_builder.add(glob);
}
let exclude_matcher = exclude_builder.build()?;
let exclude_matcher = exclude_builder
.build()
.map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;

// TODO(konsti): Add files linked by pyproject.toml

for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| {
let relative = dir
for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| {
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
// TODO(konsti): Also check that we're matching at least a prefix of an include matcher.
!exclude_matcher.is_match(relative)
.expect("walkdir starts with root")
.to_path_buf();

// Fast path: Don't descend into a directory that can't be included. This is the most
// important performance optimization: it keeps us from descending into e.g. the `.venv`.
// While walkdir is generally cheap, we still need to avoid traversing data directories at
// least on the top level, and each IO operation has a high latency on network file
// systems (compared to reading the file).
include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
}) {
let entry = file.map_err(|err| Error::WalkDir {
let entry = entry.map_err(|err| Error::WalkDir {
root: source_tree.to_path_buf(),
err,
})?;
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
if !include_matcher.is_match(relative) {
.expect("walkdir starts with root")
.to_path_buf();

if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
trace!("Excluding {}", relative.user_display());
continue;
}
};

debug!("Including {}", relative.user_display());

let metadata = fs_err::metadata(entry.path())?;
Expand Down Expand Up @@ -462,7 +546,7 @@ pub fn build_source_dist(
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
} else {
return Err(Error::UnsupportedFileType(
relative.to_path_buf(),
relative.clone(),
entry.file_type(),
));
}
Expand Down
Loading

0 comments on commit ddde029

Please sign in to comment.