Maximising Performance

PyProBE uses Polars LazyFrames under the hood. This means that data isn’t loaded into memory and calculations aren’t run until the data is requested by the user, either as a plot or as a DataFrame. This is what makes working with PyProBE much faster than working with pandas DataFrames, as this example notebook demonstrates.

Working with LazyFrames efficiently, though, requires following some best practices, which this notebook demonstrates.

import timeit

import matplotlib.pyplot as plt

import pyprobe

# Load test data
# Relative path to the directory containing the sample Neware data file
# that load_data() imports.
data_directory = "../../../tests/sample_data/neware"
# Metadata passed as `info` when load_data() constructs a pyprobe.Cell.
info_dictionary = {"test_name": "Sample", "device": "Neware"}


def load_data():
    """Create a new cell, import the sample data and return one discharge step.

    A fresh ``pyprobe.Cell`` is built and the sample parquet file is imported
    on every call, so each benchmark run starts from a brand-new object.

    Returns:
        The ``discharge(0)`` step of ``cycle(1)`` in the "Break-in Cycles"
        experiment of the "Sample" procedure.
    """
    parquet_path = data_directory + "/sample_data_neware.parquet"

    fresh_cell = pyprobe.Cell(info=info_dictionary)
    fresh_cell.import_data(procedure_name="Sample", data_path=parquet_path)

    procedure = fresh_cell.procedure["Sample"]
    selected_cycle = procedure.experiment("Break-in Cycles").cycle(1)
    return selected_cycle.discharge(0)

Single get() with Multiple Arguments vs Multiple get() Calls

When you need to retrieve multiple columns, the most efficient approach is to use a single get() call with multiple column arguments. This processes all columns in a single lazy evaluation plan, rather than running a separate evaluation for each column as happens when get() is called once per column.

# Method 1: Multiple separate get() calls
def multiple_get_calls():
    """Load fresh data and fetch each column with its own get() call."""
    step = load_data()
    # One get() per column; the returned values are discarded because only
    # the time taken matters for the benchmark.
    for column in ("Time [s]", "Current [A]", "Voltage [V]"):
        step.get(column)


# Method 2: Single get() with multiple column arguments
def single_get_multiple_args():
    """Load fresh data and fetch all three columns in a single get() call."""
    columns = ("Time [s]", "Current [A]", "Voltage [V]")
    # The returned values are discarded; only the elapsed time is of interest.
    load_data().get(*columns)


# Benchmark the two methods
num_runs = 10
# timeit returns the total time for `num_runs` executions; dividing gives the
# average time per execution. The methods are timed in the order listed.
time_multiple_get, time_single_get = (
    timeit.timeit(benchmark, number=num_runs) / num_runs
    for benchmark in (multiple_get_calls, single_get_multiple_args)
)

# Visualize the results
plt.figure(figsize=(8, 6))
methods = [
    "Multiple get()\ncalls",
    "Single get()\nwith multiple args",
]
# Average time per run, scaled by 1000 to match the "Time (ms)" axis label.
times = [
    time_multiple_get * 1000,
    time_single_get * 1000,
]
colors = ["#ff7f0e", "#1f77b4"]
bars = plt.bar(methods, times, color=colors)
plt.ylabel("Time (ms)")
plt.title("Single get() with Multiple Arguments vs Multiple get() Calls")
# Leave 20% headroom above the tallest bar so its value label fits.
plt.ylim(0, max(times) * 1.2)

# Add value labels on bars
# The loop variable is deliberately not called `time`: at notebook top level
# it would stay in the global namespace and shadow the stdlib module name.
for bar, elapsed_ms in zip(bars, times):
    plt.text(
        bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
        bar.get_height(),  # anchor the label at the top of the bar
        f"{elapsed_ms:.2f} ms",
        ha="center",
        va="bottom",
    )

plt.tight_layout()
plt.show()

Using collect() to Optimize Multiple get() Calls

If you need to call get() multiple times, you can improve performance by calling collect() first. This materializes the lazy dataframe once, and subsequent get() calls operate on the collected data, avoiding repeated lazy evaluation.

# Benchmark multiple numbers of get() calls
num_calls_list = [1, 3, 5, 10, 15, 20]
times_multiple_get = []
times_collect_then_get = []
# The repeat count is the same for every data point, so set it once.
num_runs = 10


# The benchmark bodies are defined once, outside the loop, under names that do
# not clash with the earlier `multiple_get_calls` function. Redefining that
# name here would make a re-run of the first benchmark time the wrong code.
def repeated_get_calls(num_calls):
    """Method 1: load fresh data, then run `num_calls` sets of three get() calls."""
    result = load_data()
    for _ in range(num_calls):
        result.get("Time [s]")
        result.get("Current [A]")
        result.get("Voltage [V]")


def collect_then_repeated_get_calls(num_calls):
    """Method 2: load fresh data, collect() once, then run the same get() sets."""
    result = load_data()
    result.collect()
    for _ in range(num_calls):
        result.get("Time [s]")
        result.get("Current [A]")
        result.get("Voltage [V]")


for num_calls in num_calls_list:
    # Bind the current num_calls as a default so each timed callable is
    # self-contained and takes no arguments, as timeit requires.
    time_mg = (
        timeit.timeit(lambda n=num_calls: repeated_get_calls(n), number=num_runs)
        / num_runs
    )
    time_cg = (
        timeit.timeit(
            lambda n=num_calls: collect_then_repeated_get_calls(n),
            number=num_runs,
        )
        / num_runs
    )

    times_multiple_get.append(time_mg * 1000)  # Convert to ms
    times_collect_then_get.append(time_cg * 1000)  # Convert to ms

# Plot the results
# Line styling shared by both series.
shared_line_style = {"linewidth": 2, "markersize": 8}

plt.figure(figsize=(10, 6))
plt.plot(
    num_calls_list,
    times_multiple_get,
    marker="o",
    label="Multiple get() calls",
    color="#ff7f0e",
    **shared_line_style,
)
plt.plot(
    num_calls_list,
    times_collect_then_get,
    marker="s",
    label="Single collect() + get() calls",
    color="#2ca02c",
    **shared_line_style,
)

# Axis labelling and decoration
plt.xlabel("Number of get() Call Sets")
plt.ylabel("Total Time (ms)")
plt.title("Performance: Multiple get() Calls vs collect() + get() Calls")
plt.xticks(num_calls_list)  # one tick per benchmarked call count
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()