Maximising Performance#

PyProBE uses Polars LazyFrames under the hood. This means that data isn’t loaded into memory and calculations aren’t run until the data is requested by the user, either as a plot or as a DataFrame. This is what makes working with PyProBE much faster than working with Pandas DataFrames, as this example notebook demonstrates.

Working with LazyFrames efficiently, though, requires following some best practices, which this notebook demonstrates.

import timeit

import matplotlib.pyplot as plt

import pyprobe
# Relative path to the directory holding the sample Neware data
data_directory = "../../../tests/sample_data/neware"
# Metadata passed as ``info`` when constructing the pyprobe.Cell
info_dictionary = dict(test_name="Sample", device="Neware")


def load_data():
    """Build a brand-new cell and return the selection used by every benchmark.

    A new ``pyprobe.Cell`` is created on each call and the sample parquet file
    is imported as the "Sample" procedure, so no benchmark run reuses an
    object produced by a previous run.

    Returns:
        The ``discharge(0)`` selection of ``cycle(1)`` within the
        "Break-in Cycles" experiment of the "Sample" procedure.
    """
    cell = pyprobe.Cell(info=info_dictionary)
    parquet_path = f"{data_directory}/sample_data_neware.parquet"
    cell.import_data(procedure_name="Sample", data_path=parquet_path)

    break_in = cell.procedure["Sample"].experiment("Break-in Cycles")
    first_cycle = break_in.cycle(1)
    return first_cycle.discharge(0)

Single get() with Multiple Arguments vs Multiple get() Calls#

When you need to retrieve multiple columns, the most efficient approach is to use a single get() call with multiple column arguments. This processes all columns in a single lazy evaluation plan, whereas calling get() separately for each column builds and collects a separate plan every time.

# Method 1: one get() call per column
def multiple_get_calls():
    """Load fresh data, then request three columns with three separate get() calls.

    The returned arrays are discarded; only the time taken is of interest.
    """
    discharge = load_data()
    for column in ("Time [s]", "Current [A]", "Voltage [V]"):
        discharge.get(column)


# Method 2: all columns requested through one get() call
def single_get_multiple_args():
    """Load fresh data, then request three columns with a single get() call.

    The returned value is discarded; only the time taken is of interest.
    """
    columns = ("Time [s]", "Current [A]", "Voltage [V]")
    load_data().get(*columns)


# Benchmark the two methods: each function is executed num_runs times and the
# total reported by timeit is divided by num_runs to give a mean time per run.
num_runs = 10
time_multiple_get = timeit.timeit(multiple_get_calls, number=num_runs) / num_runs
time_single_get = timeit.timeit(single_get_multiple_args, number=num_runs) / num_runs

# Visualize the results
plt.figure(figsize=(8, 6))
methods = [
    "Multiple get()\ncalls",
    "Single get()\nwith multiple args",
]
# timeit reports seconds; the chart is labelled in milliseconds.
times = [
    time_multiple_get * 1000,
    time_single_get * 1000,
]
colors = ["#ff7f0e", "#1f77b4"]
bars = plt.bar(methods, times, color=colors)
plt.ylabel("Time (ms)")
plt.title("Single get() with Multiple Arguments vs Multiple get() Calls")
# Leave 20% headroom above the tallest bar so its value label fits.
plt.ylim(0, max(times) * 1.2)

# Add value labels on bars. The loop variable is deliberately not called
# `time`: at notebook scope that name would shadow the standard-library module.
for bar, time_ms in zip(bars, times):
    plt.text(
        bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
        bar.get_height(),  # anchor the label at the top of the bar
        f"{time_ms:.2f} ms",
        ha="center",
        va="bottom",
    )

plt.tight_layout()
plt.show()

Using collect() to Optimize Multiple get() Calls#

If you need to call get() multiple times, you can improve performance by calling collect() first. This materializes the lazy dataframe once, and subsequent get() calls operate on the collected data, avoiding repeated lazy evaluation.

# Benchmark how the total time scales with the number of repeated get() calls
num_calls_list = [1, 3, 5, 10, 15, 20]
num_runs = 10  # timeit repetitions averaged for every data point
times_multiple_get = []
times_collect_then_get = []

# NOTE: this cell is expensive. The uncollected variant alone performs
# sum(num_calls_list) * 3 * num_runs = 1620 get() calls. If the cell exceeds
# the execution time available (e.g. in a docs build), lower num_runs or
# shorten num_calls_list.


def _mean_get_time_ms(num_calls, collect_first):
    """Return the mean time, in milliseconds, of one benchmark run.

    One run loads fresh data with ``load_data()``, optionally calls
    ``collect()`` on it once, and then performs ``num_calls`` sets of three
    single-column ``get()`` calls ("Time [s]", "Current [A]", "Voltage [V]").

    ``num_calls`` is an ordinary parameter rather than a loop variable captured
    by a closure defined inside the loop, and the helper has its own private
    name so that it does not replace the top-level ``multiple_get_calls``.

    Args:
        num_calls: Number of three-column get() sets performed per run.
        collect_first: If True, call ``collect()`` once before the get() calls.

    Returns:
        Total time reported by ``timeit`` for ``num_runs`` runs, divided by
        ``num_runs`` and converted from seconds to milliseconds.
    """

    def run_once():
        result = load_data()
        if collect_first:
            result.collect()
        for _ in range(num_calls):
            _ = result.get("Time [s]")
            _ = result.get("Current [A]")
            _ = result.get("Voltage [V]")

    return timeit.timeit(run_once, number=num_runs) / num_runs * 1000


# Both variants are measured back-to-back for each call count.
for num_calls in num_calls_list:
    # Method 1: Multiple separate get() calls
    times_multiple_get.append(_mean_get_time_ms(num_calls, collect_first=False))
    # Method 2: Single collect() followed by multiple get() calls
    times_collect_then_get.append(_mean_get_time_ms(num_calls, collect_first=True))

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(
    num_calls_list,
    times_multiple_get,
    marker="o",
    linewidth=2,
    markersize=8,
    label="Multiple get() calls",
    color="#ff7f0e",
)
plt.plot(
    num_calls_list,
    times_collect_then_get,
    marker="s",
    linewidth=2,
    markersize=8,
    label="Single collect() + get() calls",
    color="#2ca02c",
)
plt.xlabel("Number of get() Call Sets")
plt.ylabel("Total Time (ms)")
plt.title("Performance: Multiple get() Calls vs collect() + get() Calls")
plt.xticks(num_calls_list)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[4], line 26
     22             _ = result.get("Voltage [V]")
     23 
     24     # Benchmark
     25     num_runs = 10
---> 26     time_mg = timeit.timeit(multiple_get_calls, number=num_runs) / num_runs
     27     time_cg = timeit.timeit(single_collect_then_get, number=num_runs) / num_runs
     28 
     29     times_multiple_get.append(time_mg * 1000)  # Convert to ms

File ~/.asdf/installs/python/3.12.12/lib/python3.12/timeit.py:237, in timeit(stmt, setup, timer, number, globals)
    234 def timeit(stmt="pass", setup="pass", timer=default_timer,
    235            number=default_number, globals=None):
    236     """Convenience function to create Timer object and call timeit method."""
--> 237     return Timer(stmt, setup, timer, globals).timeit(number)

File ~/.asdf/installs/python/3.12.12/lib/python3.12/timeit.py:180, in Timer.timeit(self, number)
    178 gc.disable()
    179 try:
--> 180     timing = self.inner(it, self.timer)
    181 finally:
    182     if gcold:

File <timeit-src>:6, in inner(_it, _timer, _stmt)
      2 'Could not get source, probably due dynamically evaluated source code.'

Cell In[4], line 11, in multiple_get_calls()
      8     def multiple_get_calls():
      9         result = load_data()
     10         for _ in range(num_calls):
---> 11             _ = result.get("Time [s]")
     12             _ = result.get("Current [A]")
     13             _ = result.get("Voltage [V]")

File ~/checkouts/readthedocs.org/user_builds/pyprobe/checkouts/latest/pyprobe/result.py:328, in Result.get(self, *column_names)
    326     raise ValueError(error_msg)
    327 self.check_columns(list(column_names))
--> 328 array = self.lf.select(*column_names).collect().to_numpy()
    329 if len(column_names) == 1:
    330     return array.T[0]

File ~/checkouts/readthedocs.org/user_builds/pyprobe/checkouts/latest/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py:97, in deprecate_streaming_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     93         kwargs["engine"] = "in-memory"
     95     del kwargs["streaming"]
---> 97 return function(*args, **kwargs)

File ~/checkouts/readthedocs.org/user_builds/pyprobe/checkouts/latest/.venv/lib/python3.12/site-packages/polars/lazyframe/opt_flags.py:343, in forward_old_opt_flags.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    340         optflags = cb(optflags, kwargs.pop(key))  # type: ignore[no-untyped-call,unused-ignore]
    342 kwargs["optimizations"] = optflags
--> 343 return function(*args, **kwargs)

File ~/checkouts/readthedocs.org/user_builds/pyprobe/checkouts/latest/.venv/lib/python3.12/site-packages/polars/lazyframe/frame.py:2510, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, engine, background, optimizations, **_kwargs)
   2508 # Only for testing purposes
   2509 callback = _kwargs.get("post_opt_callback", callback)
-> 2510 return wrap_df(ldf.collect(engine, callback))

KeyboardInterrupt: