diff --git a/Cargo.lock b/Cargo.lock
index bbbdf31f5..8b689c7af 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5229,9 +5229,9 @@ dependencies = [
[[package]]
name = "pyo3"
-version = "0.22.6"
+version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
+checksum = "e484fd2c8b4cb67ab05a318f1fd6fa8f199fcc30819f08f07d200809dba26c15"
dependencies = [
"cfg-if",
"indoc",
@@ -5247,9 +5247,9 @@ dependencies = [
[[package]]
name = "pyo3-build-config"
-version = "0.22.6"
+version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
+checksum = "dc0e0469a84f208e20044b98965e1561028180219e35352a2afaf2b942beff3b"
dependencies = [
"once_cell",
"target-lexicon",
@@ -5257,9 +5257,9 @@ dependencies = [
[[package]]
name = "pyo3-ffi"
-version = "0.22.6"
+version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
+checksum = "eb1547a7f9966f6f1a0f0227564a9945fe36b90da5a93b3933fc3dc03fae372d"
dependencies = [
"libc",
"pyo3-build-config",
@@ -5267,9 +5267,9 @@ dependencies = [
[[package]]
name = "pyo3-macros"
-version = "0.22.6"
+version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
+checksum = "fdb6da8ec6fa5cedd1626c886fc8749bdcbb09424a86461eb8cdf096b7c33257"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
@@ -5279,9 +5279,9 @@ dependencies = [
[[package]]
name = "pyo3-macros-backend"
-version = "0.22.6"
+version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
+checksum = "38a385202ff5a92791168b1136afae5059d3ac118457bb7bc304c197c2d33e7d"
dependencies = [
"heck",
"proc-macro2",
diff --git a/Cargo.toml b/Cargo.toml
index 882b88147..45db90203 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -189,7 +189,7 @@ polars = { version = "0.45", features = [
"timezones",
], optional = true }
publicsuffix = { version = "2.2", optional = true }
-pyo3 = { version = "0.22", features = ["auto-initialize"], optional = true }
+pyo3 = { version = "0.23", features = ["auto-initialize"], optional = true }
qsv-dateparser = "0.12"
qsv_docopt = "1.8"
qsv-stats = "0.25"
diff --git a/README.md b/README.md
index 2807408a7..c10b90061 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@
| [pro](/src/cmd/pro.rs#L2) | Interact with the [qsv pro](https://qsvpro.dathere.com) API. |
| [prompt](/src/cmd/prompt.rs#L2)✨ | Open a file dialog to either pick a file as input or save output to a file. |
| [pseudo](/src/cmd/pseudo.rs#L2)
🔣👆 | [Pseudonymise](https://en.wikipedia.org/wiki/Pseudonymization) the value of the given column by replacing them with an incremental identifier. |
-| [py](/src/cmd/python.rs#L2)✨
📇🔣 | Create a new computed column or filter rows by evaluating a python expression on every row of a CSV file. Python's [f-strings](https://www.freecodecamp.org/news/python-f-strings-tutorial-how-to-use-f-strings-for-string-formatting/) is particularly useful for extended formatting, [with the ability to evaluate Python expressions as well](https://github.com/dathere/qsv/blob/4cd00dca88addf0d287247fa27d40563b6d46985/src/cmd/python.rs#L23-L31). |
+| [py](/src/cmd/python.rs#L2)✨
📇🔣 | Create a new computed column or filter rows by evaluating a Python expression on every row of a CSV file. Python's [f-strings](https://www.freecodecamp.org/news/python-f-strings-tutorial-how-to-use-f-strings-for-string-formatting/) are particularly useful for extended formatting, [with the ability to evaluate Python expressions as well](https://github.com/dathere/qsv/blob/4cd00dca88addf0d287247fa27d40563b6d46985/src/cmd/python.rs#L23-L31). [Requires Python 3.8 or greater](https://github.com/dathere/qsv/blob/master/docs/INTERPRETERS.md#building-qsv-with-python-feature). |
| [rename](/src/cmd/rename.rs#L2) | Rename the columns of a CSV efficiently. |
| [replace](/src/cmd/replace.rs#L2)
📇👆 | Replace CSV data using a regex. Applies the regex to each field individually. |
| [reverse](/src/cmd/reverse.rs#L2)
📇🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. If an index is present, it works with constant memory. Otherwise, it will load all the data into memory. |
diff --git a/docs/FEATURES.md b/docs/FEATURES.md
index 6f8ced893..b01b8b817 100644
--- a/docs/FEATURES.md
+++ b/docs/FEATURES.md
@@ -12,7 +12,7 @@
* `luau` - enable `luau` command. Embeds a [Luau](https://luau-lang.org) interpreter into qsv. [Luau has type-checking, sandboxing, additional language operators, increased performance & other improvements](https://luau-lang.org/2022/11/04/luau-origins-and-evolution.html) over Lua. Luau is the DSL of qsv - as its statically linked, has a MUCH smaller footprint (in both file size and memory without having to deal with Python's infamous [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock)) & is faster (in both startup & execution time) than Python.
* `polars` - enables all [Polars](https://pola.rs)-powered commands (currently, `joinp` and `sqlp`. Also enables polars mode in `count`). Note that Polars is a very powerful library, but it has a lot of dependencies that drastically increases both compile time and binary size.
* `prompt` - enable `prompt` command.
-* `python` - enable `py` command. Note that qsv will look for the shared library for the Python version (Python 3.7 & above supported) it was compiled against & will abort on startup if the library is not found, even if you're NOT using the `py` command. Check [Python](#python) section for more info. Though Luau is the preferred DSL for qsv for all the reasons stated above, Python is still the lingua franca of data wrangling.
+* `python` - enable `py` command. Note that qsv will look for the shared library for the Python version (Python 3.8 & above supported) it was compiled against & will abort on startup if the library is not found, even if you're NOT using the `py` command. Check [Python](#python) section for more info. Though Luau is the preferred DSL for qsv for all the reasons stated above, Python is still the lingua franca of data wrangling.
* `to` - enables the `to` command.
* `self_update` - enable self-update engine, checking GitHub for the latest release. Note that if you manually built qsv, `self-update` will only alert you about new releases (it checks GitHub for the latest release 10% of the time upon startup unless the `QSV_NO_UPDATE` environment variable is set). It will NOT offer the choice to update itself to the prebuilt binaries published on GitHub.
You need not worry that your manually built qsv will be overwritten by a self-update.
diff --git a/docs/INTERPRETERS.md b/docs/INTERPRETERS.md
index ffa9f8519..2d015c3b6 100644
--- a/docs/INTERPRETERS.md
+++ b/docs/INTERPRETERS.md
@@ -14,20 +14,20 @@ As date manipulation is often needed, the [LuaDate](https://tieske.github.io/dat
Finally, as [qsv's DSL](../README.md#luau_deeplink) (👑), `luau` will gain even more features over time compared to the `python` feature.
-[Luau 0.640](https://github.com/Roblox/luau/releases/tag/0.640) is currently embedded - qsv's policy is to use the latest stable Luau version at the time of each qsv release.
+[Luau 0.653](https://github.com/Roblox/luau/releases/tag/0.653) is currently embedded - qsv's policy is to use the latest stable Luau version at the time of each qsv release.
## Building qsv with python feature
-If you wish to build qsv with the `python` feature, make sure you have the development libraries for the desired Python version (Python 3.7 and above are supported) installed when doing so (e.g. on Debian/Ubuntu - `apt-get install python-dev`; on CentOS/RedHat/Amazon Linux - `yum install python-devel`; on Windows and macOS - use the [Python installer](https://www.python.org/downloads/) for the desired version).
+If you wish to build qsv with the `python` feature, make sure you have the development libraries for the desired Python version (Python 3.8 and above are supported) installed when doing so (e.g. on Debian/Ubuntu - `apt-get install python-dev`; on CentOS/RedHat/Amazon Linux - `yum install python-devel`; on Windows and macOS - use the [Python installer](https://www.python.org/downloads/) for the desired version).
If you plan to distribute your manually built `qsv` with the `python` feature, `qsv` will look for the specific version of Python shared libraries (libpython* on Linux/macOS, python*.dll on Windows) against which it was compiled starting with the current directory & abort with an error if not found, detailing the Python library it was looking for.
Note that this will happen on qsv startup, even if you're NOT running the `py` command.
When building from source - [PyO3](https://pyo3.rs) - the underlying crate that enables the `python` feature, uses a build script to determine the Python version & set the correct linker arguments. By default it uses the python3 executable.
-You can override this by setting `PYO3_PYTHON` (e.g., `PYO3_PYTHON=python3.7`), before installing/compiling qsv. See the [PyO3 User Guide](https://pyo3.rs/v0.17.1/building_and_distribution.html) for more information.
+You can override this by setting `PYO3_PYTHON` (e.g., `PYO3_PYTHON=python3.8`), before installing/compiling qsv. See the [PyO3 User Guide](https://pyo3.rs/v0.23.3/building-and-distribution.html) for more information.
Consider using the [`luau`](/src/cmd/luau.rs#L2) command instead of the [`py`]((/src/cmd/python.rs#L2)) command if the operation you're trying to do can be done with `luau` - as `luau` is statically linked, has no external dependencies, much faster than `py`, can do aggregations, supports random access, has a bevy of qsv helper functions, and allows mapping of multiple new columns.
-The `py` command cannot do aggregations because [PyO3's GIL-bound memory](https://pyo3.rs/v0.17.2/memory.html#gil-bound-memory) limitations will quickly consume a lot of memory (see [issue 449](https://github.com/dathere/qsv/issues/449#issuecomment-1226095316) for details).
-To prevent this, the `py` command processes CSVs in batches (default: 30,000 records), with a GIL pool for each batch, so no globals are available across batches.
+The `py` command cannot do aggregations because Python's Global Interpreter Lock (GIL) limitations will quickly consume a lot of memory (see [issue 449](https://github.com/dathere/qsv/issues/449#issuecomment-1226095316) for details).
+To prevent this, the `py` command processes CSVs in batches (default: 50,000 records), with a GIL pool for each batch, so no globals are available across batches.
diff --git a/src/cmd/python.rs b/src/cmd/python.rs
index a6949ceb7..150b293f1 100644
--- a/src/cmd/python.rs
+++ b/src/cmd/python.rs
@@ -1,5 +1,5 @@
static USAGE: &str = r#"
-Create a new computed column or filter rows by evaluating a python expression on
+Create a new computed column or filter rows by evaluating a Python expression on
every row of a CSV file.
The executed Python has 4 ways to reference cell values (as strings):
@@ -72,9 +72,9 @@ Some usage examples:
Also, the following Python modules are automatically loaded and available to the user -
builtsin, math, random & datetime. The user can import additional modules with the --helper option,
- with the ability to use any python module that's installed in the current python virtualenv.
+ with the ability to use any Python module that's installed in the current Python virtualenv.
- The python expression is evaluated on a per record basis.
+ The Python expression is evaluated on a per record basis.
With "py map", if the expression is invalid for a record, "" is returned for that record.
With "py filter", if the expression is invalid for a record, that record is not filtered.
@@ -92,11 +92,11 @@ Usage:
qsv py --help
py argument:
- Can either be a python expression, or if it starts with
+ Can either be a Python expression, or if it starts with
"file:" or ends with ".py" - the filepath from which to
- load the python expression.
+ load the Python expression.
Note that argument expects a SINGLE expression, and not
- a full-blown python script. Use the --helper option
+ a full-blown Python script. Use the --helper option
to load helper code that you can call from the expression.
py options:
@@ -108,8 +108,7 @@ py options:
-b, --batch The number of rows per batch to process before
releasing memory and acquiring a new GILpool.
Set to 0 to process the entire file in one batch.
- See https://pyo3.rs/v0.21.0/memory.html#gil-bound-memory
- for more info. [default: 50000]
+ [default: 50000]
Common options:
-h, --help Display this message
@@ -123,7 +122,7 @@ Common options:
-p, --progressbar Show progress bars. Not valid for stdin.
"#;
-use std::fs;
+use std::{ffi::CString, fs};
use indicatif::{ProgressBar, ProgressDrawTarget};
use pyo3::{
@@ -201,34 +200,33 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
if debug_flag {
Python::with_gil(|py| {
- let msg = format!("Detected python={}", py.version());
+ let msg = format!("Detected Python={}", py.version());
winfo!("{msg}");
});
}
- let arg_expression =
- if let Some(expression_filepath) = args.arg_expression.strip_prefix("file:") {
- match fs::read_to_string(expression_filepath) {
- Ok(file_contents) => file_contents,
- Err(e) => return fail_clierror!("Cannot load Python expression from file: {e}"),
- }
- } else if std::path::Path::new(&args.arg_expression)
- .extension()
- .is_some_and(|ext| ext.eq_ignore_ascii_case("py"))
- {
- match fs::read_to_string(args.arg_expression.clone()) {
- Ok(file_contents) => file_contents,
- Err(e) => return fail_clierror!("Cannot load .py file: {e}"),
- }
- } else {
- args.arg_expression.clone()
- };
+ let expression = if let Some(expression_filepath) = args.arg_expression.strip_prefix("file:") {
+ match fs::read_to_string(expression_filepath) {
+ Ok(file_contents) => file_contents,
+ Err(e) => return fail_clierror!("Cannot load Python expression from file: {e}"),
+ }
+ } else if std::path::Path::new(&args.arg_expression)
+ .extension()
+ .is_some_and(|ext| ext.eq_ignore_ascii_case("py"))
+ {
+ match fs::read_to_string(args.arg_expression.clone()) {
+ Ok(file_contents) => file_contents,
+ Err(e) => return fail_clierror!("Cannot load .py file: {e}"),
+ }
+ } else {
+ args.arg_expression.clone()
+ };
let mut helper_text = String::new();
if let Some(helper_file) = args.flag_helper {
helper_text = match fs::read_to_string(helper_file) {
Ok(helper_file) => helper_file,
- Err(e) => return fail_clierror!("Cannot load python file: {e}"),
+ Err(e) => return fail_clierror!("Cannot load Python file: {e}"),
}
}
@@ -263,7 +261,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
progress.set_draw_target(ProgressDrawTarget::hidden());
}
- // ensure col/header names are valid and safe python variables
+ // ensure col/header names are valid and safe Python variables
let (header_vec, _) = util::safe_header_names(&headers, true, false, None, "_", false);
// amortize memory allocation by reusing record
@@ -280,14 +278,26 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// reuse batch buffers
let mut batch = Vec::with_capacity(batch_size);
+ // safety: safe to unwrap as these are statically defined
+ let helpers_code = CString::new(HELPERS).unwrap();
+ let helpers_filename = CString::new("qsv_helpers.py").unwrap();
+ let helpers_module_name = CString::new("qsv_helpers").unwrap();
+ let user_helpers_code = CString::new(helper_text)
+ .map_err(|e| format!("Failed to create CString from helper text: {e}"))?;
+ // safety: safe to unwrap as these are statically defined
+ let user_helpers_filename = CString::new("qsv_user_helpers.py").unwrap();
+ let user_helpers_module_name = CString::new("qsv_uh").unwrap();
+
+ let arg_expression = CString::new(expression)
+ .map_err(|e| format!("Failed to create CString from expression: {e}"))?;
+
let mut row_number = 0_u64;
let debug_flag = log::log_enabled!(log::Level::Debug);
// main loop to read CSV and construct batches.
- // we batch python operations so that the GILPool does not get very large
+ // we batch Python operations so that the GILPool does not get very large
// as we release the pool after each batch
// loop exits when batch is empty.
- // see https://pyo3.rs/latest/memory.html#gil-bound-memory for more info.
'batch_loop: loop {
for _ in 0..batch_size {
match rdr.read_record(&mut batch_record) {
@@ -312,19 +322,23 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
Python::with_gil(|py| -> PyResult<()> {
let batch_ref = &mut batch;
- let helpers = PyModule::from_code_bound(py, HELPERS, "qsv_helpers.py", "qsv_helpers")?;
- let batch_globals = PyDict::new_bound(py);
- let batch_locals = PyDict::new_bound(py);
-
- let user_helpers =
- PyModule::from_code_bound(py, &helper_text, "qsv_user_helpers.py", "qsv_uh")?;
+ let helpers =
+ PyModule::from_code(py, &helpers_code, &helpers_filename, &helpers_module_name)?;
+ let batch_globals = PyDict::new(py);
+ let batch_locals = PyDict::new(py);
+ let user_helpers = PyModule::from_code(
+ py,
+ &user_helpers_code,
+ &user_helpers_filename,
+ &user_helpers_module_name,
+ )?;
batch_globals.set_item(intern!(py, "qsv_uh"), user_helpers)?;
// Global imports
- let builtins = PyModule::import_bound(py, "builtins")?;
- let math_module = PyModule::import_bound(py, "math")?;
- let random_module = PyModule::import_bound(py, "random")?;
- let datetime_module = PyModule::import_bound(py, "datetime")?;
+ let builtins = PyModule::import(py, "builtins")?;
+ let math_module = PyModule::import(py, "math")?;
+ let random_module = PyModule::import(py, "random")?;
+ let datetime_module = PyModule::import(py, "datetime")?;
batch_globals.set_item("__builtins__", builtins)?;
batch_globals.set_item("math", math_module)?;
@@ -367,8 +381,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
py_row.call_method1(intern!(py, "_update_underlying_data"), (row_data,))?;
let result =
- match py.eval_bound(&arg_expression, Some(&batch_globals), Some(&batch_locals))
- {
+ match py.eval(&arg_expression, Some(&batch_globals), Some(&batch_locals)) {
Ok(result) => result,
Err(e) => {
error_count += 1;