Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions crates/core/src/analyzer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Analyzer rules layered on top of DataFusion's defaults.

use datafusion::common::Result;
use datafusion::common::config::ConfigOptions;
use datafusion::logical_expr::LogicalPlan;
use datafusion::optimizer::AnalyzerRule;

/// Analyzer rule that resolves [`LambdaVariable`] references into bound lambda
/// parameters.
///
/// DataFusion's SQL planner resolves lambda variables inline as it plans a
/// higher-order function call, so SQL-built plans never carry unresolved
/// variables. Plans assembled programmatically through the Python expression
/// builder (e.g. `array_transform(col("xs"), lambda_(["v"], lambda_var("v")))`)
/// do carry them, and nothing in the default analyzer resolves them. This rule
/// runs [`LogicalPlan::resolve_lambda_variables`] so both construction paths
/// reach the optimizer with bound lambdas.
///
/// The type carries no state, so every instance compares equal and
/// [`ResolveLambdaVariables::new`] and [`Default::default`] are interchangeable.
///
/// [`LambdaVariable`]: datafusion::logical_expr::expr::LambdaVariable
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ResolveLambdaVariables {}

impl ResolveLambdaVariables {
    /// Create the rule.
    ///
    /// Equivalent to [`Default::default`]; provided as a `const fn` so the
    /// rule can also be built in constant contexts.
    #[must_use]
    pub const fn new() -> Self {
        Self {}
    }
}

impl AnalyzerRule for ResolveLambdaVariables {
    /// Run `resolve_lambda_variables` on `plan` and hand back the `data`
    /// payload of its result.
    ///
    /// Any error from the resolution step is propagated unchanged. The
    /// session configuration is not consulted.
    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
        let resolved = plan.resolve_lambda_variables()?;
        Ok(resolved.data)
    }

    /// Identifier under which this rule is reported.
    fn name(&self) -> &str {
        "resolve_lambda_variables"
    }
}
1 change: 1 addition & 0 deletions crates/core/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ impl PySessionContext {
.with_config(config)
.with_runtime_env(runtime)
.with_default_features()
.with_analyzer_rule(Arc::new(crate::analyzer::ResolveLambdaVariables::new()))
.build();
let ctx = Arc::new(SessionContext::new_with_state(session_state));
Ok(PySessionContext {
Expand Down
37 changes: 37 additions & 0 deletions crates/core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,37 @@ fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option<PyExpr>
.into()
}

/// Build a lambda expression out of its parameter names and a body
/// expression. Inside the body, parameters are referenced via [`lambda_var`].
///
/// Python sees this function as `lambda_`, since `lambda` itself is a
/// reserved keyword there.
#[pyfunction]
#[pyo3(name = "lambda_")]
fn py_lambda(params: Vec<String>, body: PyExpr) -> PyExpr {
    let lambda_expr = datafusion::logical_expr::lambda(params, body.into());
    lambda_expr.into()
}

/// Build a reference to a lambda variable, identified by `name`, that is not
/// yet resolved. Resolution against the lambda parameters is performed by the
/// owning higher-order function during planning.
#[pyfunction]
fn lambda_var(name: String) -> PyExpr {
    let variable = datafusion::logical_expr::lambda_var(name);
    variable.into()
}

/// Higher-order function that maps `transform` (a lambda) over every element
/// of `array` and yields a new array holding the results.
#[pyfunction]
fn array_transform(array: PyExpr, transform: PyExpr) -> PyExpr {
    let mapped = datafusion::functions_nested::expr_fn::array_transform(
        array.into(),
        transform.into(),
    );
    mapped.into()
}

/// Higher-order function that is true when at least one element of `array`
/// satisfies `predicate` (a lambda returning a boolean).
#[pyfunction]
fn array_any_match(array: PyExpr, predicate: PyExpr) -> PyExpr {
    let matched = datafusion::functions_nested::expr_fn::array_any_match(
        array.into(),
        predicate.into(),
    );
    matched.into()
}

/// Computes a binary hash of the given data. type is the algorithm to use.
/// Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3.
// #[pyfunction(value, method)]
Expand Down Expand Up @@ -1082,6 +1113,12 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(encode))?;
m.add_wrapped(wrap_pyfunction!(decode))?;

// Lambda / higher-order functions
m.add_wrapped(wrap_pyfunction!(py_lambda))?;
m.add_wrapped(wrap_pyfunction!(lambda_var))?;
m.add_wrapped(wrap_pyfunction!(array_transform))?;
m.add_wrapped(wrap_pyfunction!(array_any_match))?;

// Array Functions
m.add_wrapped(wrap_pyfunction!(array_append))?;
m.add_wrapped(wrap_pyfunction!(array_concat))?;
Expand Down
1 change: 1 addition & 0 deletions crates/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use mimalloc::MiMalloc;
use pyo3::prelude::*;

pub mod analyzer;
// This lint allowance sat directly above `catalog` before `analyzer` was
// added; keep it attached to that module instead of silently moving it onto
// the new one.
#[allow(clippy::borrow_deref_ref)]
pub mod catalog;
pub mod codec;
pub mod common;
Expand Down
42 changes: 42 additions & 0 deletions docs/source/user-guide/common-operations/expressions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,48 @@ This function returns a new array with the elements repeated.

In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`.

Higher-order functions and lambdas
----------------------------------

Some array functions are *higher-order*: they take a lambda that runs once per
element. :py:func:`~datafusion.functions.array_transform` maps a lambda over
every element, and :py:func:`~datafusion.functions.array_any_match` returns
whether any element satisfies a predicate lambda.

The simplest way to supply a lambda is a Python ``lambda``. Its parameter names
become the lambda parameters, and its return value becomes the body.

.. ipython:: python

from datafusion import SessionContext, col
from datafusion import functions as f

ctx = SessionContext()
df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]})
df.select(f.array_transform(col("a"), lambda v: v * 2).alias("doubled"))
df.select(f.array_any_match(col("a"), lambda v: v > 3).alias("has_big"))

If you need explicit control over parameter names, build the lambda with
:py:func:`~datafusion.functions.lambda_` and reference its parameters with
:py:func:`~datafusion.functions.lambda_var`. The following is equivalent to the
``array_transform`` call above.

.. ipython:: python

from datafusion import lit

double_fn = f.lambda_(["v"], f.lambda_var("v") * lit(2))
df.select(f.array_transform(col("a"), double_fn).alias("doubled"))

.. note::

Lambda expressions cannot yet be serialized: calling
:py:meth:`~datafusion.expr.Expr.to_bytes` or pickling an expression that
contains a lambda fails with a ``Lambda not implemented`` error. SQL lambda syntax
(``x -> x * 2``) is only parsed by dialects that support lambdas; set
``datafusion.sql_parser.dialect`` to ``DuckDB`` to use it. The Python
expression builder shown above works regardless of dialect.


Testing membership in a list
----------------------------
Expand Down
Loading
Loading