#!/usr/bin python3
""" Stats functions for the GUI.
Holds the globally loaded training session. This will either be a user selected session (loaded in
the analysis tab) or the currently training session.
"""
from __future__ import annotations
import logging
import os
import time
import typing as T
import warnings
from math import ceil
from threading import Event
import numpy as np
from lib.logger import parse_class_init
from lib.serializer import get_serializer
from .event_reader import TensorBoardLogs
logger = logging.getLogger(__name__)
class GlobalSession():
""" Holds information about a loaded or current training session by accessing a model's state
file and Tensorboard logs. This class should not be accessed directly, rather through
:attr:`lib.gui.analysis.Session`
"""
def __init__(self) -> None:
logger.debug(parse_class_init(locals()))
self._state: dict[str, T.Any] = {}
self._model_dir = ""
self._model_name = ""
self._tb_logs: TensorBoardLogs | None = None
self._summary: SessionsSummary | None = None
self._is_training = False
self._is_querying = Event()
logger.debug("Initialized %s", self.__class__.__name__)
@property
def is_loaded(self) -> bool:
""" bool: ``True`` if session data is loaded otherwise ``False`` """
return bool(self._model_dir)
@property
def is_training(self) -> bool:
""" bool: ``True`` if the loaded session is the currently training model, otherwise
``False`` """
return self._is_training
@property
def model_filename(self) -> str:
""" str: The full model filename """
return os.path.join(self._model_dir, self._model_name)
@property
def batch_sizes(self) -> dict[int, int]:
""" dict: The batch sizes for each session_id for the model. """
if not self._state:
return {}
return {int(sess_id): sess["batchsize"]
for sess_id, sess in self._state.get("sessions", {}).items()}
@property
def full_summary(self) -> list[dict]:
""" list: List of dictionaries containing summary statistics for each session id. """
assert self._summary is not None
return self._summary.get_summary_stats()
@property
def logging_disabled(self) -> bool:
""" bool: ``True`` if logging is enabled for the currently training session otherwise
``False``. """
if not self._state:
return True
max_id = str(max(int(idx) for idx in self._state["sessions"]))
return self._state["sessions"][max_id]["no_logs"]
@property
def session_ids(self) -> list[int]:
""" list: The sorted list of all existing session ids in the state file """
if self._tb_logs is None:
return []
return self._tb_logs.session_ids
def _load_state_file(self) -> None:
""" Load the current state file to :attr:`_state`. """
state_file = os.path.join(self._model_dir, f"{self._model_name}_state.json")
logger.debug("Loading State: '%s'", state_file)
serializer = get_serializer("json")
self._state = serializer.load(state_file)
logger.debug("Loaded state: %s", self._state)
def initialize_session(self,
model_folder: str,
model_name: str,
is_training: bool = False) -> None:
""" Initialize a Session.
Loads the model's state file and sets the paths to any underlying Tensorboard logs, ready
for access on request.
Parameters
----------
model_folder: str
If loading a session manually (e.g. for the analysis tab), then the path to the model
folder must be provided. For training sessions, this should be passed through from the
launcher
model_name: str
If loading a session manually (e.g. for the analysis tab), then the model filename
must be provided. For training sessions, this should be passed through from the
launcher
is_training: bool, optional
``True`` if the session is being initialized for a training session, otherwise
``False``. Default: ``False``
"""
logger.debug("Initializing session: (is_training: %s)", is_training)
if self._model_dir == model_folder and self._model_name == model_name:
if is_training:
assert self._tb_logs is not None
self._tb_logs.set_training(is_training)
self._load_state_file()
self._is_training = True
logger.debug("Requested session is already loaded. Not initializing: (model_folder: "
"%s, model_name: %s)", model_folder, model_name)
return
self._is_training = is_training
self._model_dir = model_folder
self._model_name = model_name
self._load_state_file()
if not self.logging_disabled:
self._tb_logs = TensorBoardLogs(os.path.join(self._model_dir,
f"{self._model_name}_logs"),
is_training)
self._summary = SessionsSummary(self)
logger.debug("Initialized session. Session_IDS: %s", self.session_ids)
def stop_training(self) -> None:
""" Clears the internal training flag. To be called when training completes. """
self._is_training = False
if self._tb_logs is not None:
self._tb_logs.set_training(False)
def clear(self) -> None:
""" Clear the currently loaded session. """
self._state = {}
self._model_dir = ""
self._model_name = ""
del self._tb_logs
self._tb_logs = None
del self._summary
self._summary = None
self._is_training = False
def get_loss(self, session_id: int | None) -> dict[str, np.ndarray]:
""" Obtain the loss values for the given session_id.
Parameters
----------
session_id: int or ``None``
The session ID to return loss for. Pass ``None`` to return loss for all sessions.
Returns
-------
dict
Loss names as key, :class:`numpy.ndarray` as value. If no session ID was provided
then all sessions' losses are collated
"""
self._wait_for_thread()
if self._is_training:
self._is_querying.set()
assert self._tb_logs is not None
loss_dict = self._tb_logs.get_loss(session_id=session_id)
if session_id is None:
all_loss: dict[str, list[float]] = {}
for key in sorted(loss_dict):
for loss_key, loss in loss_dict[key].items():
all_loss.setdefault(loss_key, []).extend(loss)
retval: dict[str, np.ndarray] = {key: np.array(val, dtype="float32")
for key, val in all_loss.items()}
else:
retval = loss_dict.get(session_id, {})
if self._is_training:
self._is_querying.clear()
return retval
@T.overload
def get_timestamps(self, session_id: None) -> dict[int, np.ndarray]:
...
@T.overload
def get_timestamps(self, session_id: int) -> np.ndarray:
...
def get_timestamps(self, session_id):
""" Obtain the time stamps keys for the given session_id.
Parameters
----------
session_id: int or ``None``
The session ID to return the time stamps for. Pass ``None`` to return time stamps for
all sessions.
Returns
-------
dict[int] or :class:`numpy.ndarray`
If a session ID has been given then a single :class:`numpy.ndarray` will be returned
with the session's time stamps. Otherwise a 'dict' will be returned with the session
IDs as key with :class:`numpy.ndarray` of timestamps as values
"""
self._wait_for_thread()
if self._is_training:
self._is_querying.set()
assert self._tb_logs is not None
retval = self._tb_logs.get_timestamps(session_id=session_id)
if session_id is not None:
retval = retval[session_id]
if self._is_training:
self._is_querying.clear()
return retval
def _wait_for_thread(self) -> None:
""" If a thread is querying the log files for live data, then block until task clears. """
while True:
if self._is_training and self._is_querying.is_set():
logger.debug("Waiting for available thread")
time.sleep(1)
continue
break
def get_loss_keys(self, session_id: int | None) -> list[str]:
""" Obtain the loss keys for the given session_id.
Parameters
----------
session_id: int or ``None``
The session ID to return the loss keys for. Pass ``None`` to return loss keys for
all sessions.
Returns
-------
list
The loss keys for the given session. If ``None`` is passed as session_id then a unique
list of all loss keys for all sessions is returned
"""
assert self._tb_logs is not None
loss_keys = {sess_id: list(logs.keys())
for sess_id, logs
in self._tb_logs.get_loss(session_id=session_id).items()}
if session_id is None:
retval: list[str] = list(set(loss_key
for session in loss_keys.values()
for loss_key in session))
else:
retval = loss_keys.get(session_id, [])
return retval
_SESSION = GlobalSession()
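# Illustrative (hypothetical) usage sketch of the module level session singleton, as the
# analysis tab might drive it. The folder and model names below are placeholders, not values
# taken from this module:
#
#   _SESSION.initialize_session("/path/to/model_dir", "original", is_training=False)
#   if _SESSION.is_loaded:
#       keys = _SESSION.get_loss_keys(session_id=None)   # e.g. ["loss_a", "loss_b"]
#       losses = _SESSION.get_loss(session_id=None)      # {loss_name: np.ndarray}
#       summary = _SESSION.full_summary                  # per-session stats plus a total row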
class SessionsSummary(): # pylint:disable=too-few-public-methods
""" Performs top level summary calculations for each session ID within the loaded or currently
training Session for display in the Analysis tree view.
Parameters
----------
session: :class:`GlobalSession`
The loaded or currently training session
"""
def __init__(self, session: GlobalSession) -> None:
logger.debug(parse_class_init(locals()))
self._session = session
self._state = session._state
self._time_stats: dict[int, dict[str, float | int]] = {}
self._per_session_stats: list[dict[str, T.Any]] = []
logger.debug("Initialized %s", self.__class__.__name__)
def get_summary_stats(self) -> list[dict]:
""" Compile the individual session statistics and calculate the total.
Format the stats for display
Returns
-------
list
A list of summary statistics dictionaries containing the Session ID, start time, end
time, elapsed time, rate, batch size and number of iterations for each session id
within the loaded data as well as the totals.
"""
logger.debug("Compiling sessions summary data")
self._get_time_stats()
self._get_per_session_stats()
if not self._per_session_stats:
return self._per_session_stats
total_stats = self._total_stats()
retval = self._per_session_stats + [total_stats]
retval = self._format_stats(retval)
logger.debug("Final stats: %s", retval)
return retval
def _get_time_stats(self) -> None:
""" Populates the attribute :attr:`_time_stats` with the start start time, end time and
data points for each session id within the loaded session if it has not already been
calculated.
If the main Session is currently training, then the training session ID is updated with the
latest stats.
"""
if not self._time_stats:
logger.debug("Collating summary time stamps")
self._time_stats = {
sess_id: {"start_time": np.min(timestamps) if np.any(timestamps) else 0,
"end_time": np.max(timestamps) if np.any(timestamps) else 0,
"iterations": timestamps.shape[0] if np.any(timestamps) else 0}
for sess_id, timestamps in T.cast(dict[int, np.ndarray],
self._session.get_timestamps(None)).items()}
elif _SESSION.is_training:
logger.debug("Updating summary time stamps for training session")
session_id = _SESSION.session_ids[-1]
latest = T.cast(np.ndarray, self._session.get_timestamps(session_id))
self._time_stats[session_id] = {
"start_time": np.min(latest) if np.any(latest) else 0,
"end_time": np.max(latest) if np.any(latest) else 0,
"iterations": latest.shape[0] if np.any(latest) else 0}
logger.debug("time_stats: %s", self._time_stats)
def _get_per_session_stats(self) -> None:
""" Populate the attribute :attr:`_per_session_stats` with a sorted list by session ID
of each ID in the training/loaded session. Stats contain the session ID, start, end and
elapsed times, the training rate, batch size and number of iterations for each session.
If a training session is running, then updates the training session's stats only.
"""
if not self._per_session_stats:
logger.debug("Collating per session stats")
compiled = []
for session_id in self._time_stats:
logger.debug("Compiling session ID: %s", session_id)
if not self._state:
logger.debug("Session state dict doesn't exist. Most likely task has been "
"terminated during compilation")
return
compiled.append(self._collate_stats(session_id))
self._per_session_stats = list(sorted(compiled, key=lambda k: k["session"]))
elif self._session.is_training:
logger.debug("Collating per session stats for latest training data")
session_id = self._session.session_ids[-1]
ts_data = self._time_stats[session_id]
if session_id > len(self._per_session_stats):
self._per_session_stats.append(self._collate_stats(session_id))
stats = self._per_session_stats[-1]
start = np.nan_to_num(ts_data["start_time"])
end = np.nan_to_num(ts_data["end_time"])
stats["start"] = start
stats["end"] = end
stats["elapsed"] = int(end - start)
stats["iterations"] = ts_data["iterations"]
stats["rate"] = (((stats["batch"] * 2) * stats["iterations"])
/ stats["elapsed"] if stats["elapsed"] > 0 else 0)
logger.debug("per_session_stats: %s", self._per_session_stats)
def _collate_stats(self, session_id: int) -> dict[str, int | float]:
""" Collate the session summary statistics for the given session ID.
Parameters
----------
session_id: int
The session id to compile the stats for
Returns
-------
dict
The collated session summary statistics
"""
timestamps = self._time_stats[session_id]
start = np.nan_to_num(timestamps["start_time"])
end = np.nan_to_num(timestamps["end_time"])
elapsed = int(end - start)
batchsize = self._session.batch_sizes.get(session_id, 0)
retval = {
"session": session_id,
"start": start,
"end": end,
"elapsed": elapsed,
"rate": (((batchsize * 2) * timestamps["iterations"]) / elapsed
if elapsed != 0 else 0),
"batch": batchsize,
"iterations": timestamps["iterations"]}
logger.debug(retval)
return retval
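# Worked example of the rate calculation above (an illustration, not output from this code):
# with a batch size of 16 the model processes 32 faces per iteration (presumably covering both
# the A and B sides), so 10,000 iterations over 3,600 elapsed seconds gives a rate of
# (16 * 2 * 10000) / 3600, roughly 88.9 faces per second.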
def _total_stats(self) -> dict[str, str | int | float]:
""" Compile the Totals stats.
Totals are fully calculated each time as they will change on the basis of the training
session.
Returns
-------
dict
The Session name, start time, end time, elapsed time, rate, batch size and number of
iterations for all session ids within the loaded data.
"""
logger.debug("Compiling Totals")
elapsed = 0
examples = 0
iterations = 0
batchset = set()
total_summaries = len(self._per_session_stats)
for idx, summary in enumerate(self._per_session_stats):
if idx == 0:
starttime = summary["start"]
if idx == total_summaries - 1:
endtime = summary["end"]
elapsed += summary["elapsed"]
examples += ((summary["batch"] * 2) * summary["iterations"])
batchset.add(summary["batch"])
iterations += summary["iterations"]
batch = ",".join(str(bs) for bs in batchset)
totals = {"session": "Total",
"start": starttime,
"end": endtime,
"elapsed": elapsed,
"rate": examples / elapsed if elapsed != 0 else 0,
"batch": batch,
"iterations": iterations}
logger.debug(totals)
return totals
def _format_stats(self, compiled_stats: list[dict]) -> list[dict]:
""" Format for the incoming list of statistics for display.
Parameters
----------
compiled_stats: list
List of summary statistics dictionaries to be formatted for display
Returns
-------
list
The original statistics formatted for display
"""
logger.debug("Formatting stats")
retval = []
for summary in compiled_stats:
hrs, mins, secs = self._convert_time(summary["elapsed"])
stats = {}
for key in summary:
if key not in ("start", "end", "elapsed", "rate"):
stats[key] = summary[key]
continue
stats["start"] = time.strftime("%x %X", time.localtime(summary["start"]))
stats["end"] = time.strftime("%x %X", time.localtime(summary["end"]))
stats["elapsed"] = f"{hrs}:{mins}:{secs}"
stats["rate"] = f"{summary['rate']:.1f}"
retval.append(stats)
return retval
@classmethod
def _convert_time(cls, timestamp: float) -> tuple[str, str, str]:
""" Convert time stamp to total hours, minutes and seconds.
Parameters
----------
timestamp: float
The Unix timestamp to be converted
Returns
-------
tuple
(`hours`, `minutes`, `seconds`) as strings
"""
ihrs = int(timestamp // 3600)
hrs = f"{ihrs:02d}" if ihrs < 10 else str(ihrs)
mins = f"{(int(timestamp % 3600) // 60):02d}"
secs = f"{(int(timestamp % 3600) % 60):02d}"
return hrs, mins, secs
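# A quick worked example for _convert_time (illustrative only): an elapsed time of 3725
# seconds gives 3725 // 3600 = 1 hour, (3725 % 3600) // 60 = 2 minutes and
# (3725 % 3600) % 60 = 5 seconds, so the method returns ("01", "02", "05").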
class Calculations():
""" Class that performs calculations on the :class:`GlobalSession` raw data for the given
session id.
Parameters
----------
session_id: int or ``None``
The session id number for the selected session from the Analysis tab. Should be ``None``
if all sessions are being calculated
display: {"loss", "rate"}, optional
Whether to display a graph for loss or training rate. Default: `"loss"`
loss_keys: list, optional
The list of loss keys to display on the graph. Default: `["loss"]`
selections: list, optional
The selected annotations to display. Default: `["raw"]`
avg_samples: int, optional
The number of samples to use for performing moving average calculation. Default: `500`.
smooth_amount: float, optional
The amount of smoothing to apply for performing smoothing calculation. Default: `0.9`.
flatten_outliers: bool, optional
``True`` if values significantly away from the average should be excluded, otherwise
``False``. Default: ``False``
"""
def __init__(self, session_id,
display: str = "loss",
loss_keys: list[str] | str = "loss",
selections: list[str] | str = "raw",
avg_samples: int = 500,
smooth_amount: float = 0.90,
flatten_outliers: bool = False) -> None:
logger.debug(parse_class_init(locals()))
warnings.simplefilter("ignore", np.RankWarning)
self._session_id = session_id
self._display = display
self._loss_keys = loss_keys if isinstance(loss_keys, list) else [loss_keys]
self._selections = selections if isinstance(selections, list) else [selections]
self._is_totals = session_id is None
self._args: dict[str, int | float] = {"avg_samples": avg_samples,
"smooth_amount": smooth_amount,
"flatten_outliers": flatten_outliers}
self._iterations = 0
self._limit = 0
self._start_iteration = 0
self._stats: dict[str, np.ndarray] = {}
self.refresh()
logger.debug("Initialized %s", self.__class__.__name__)
@property
def iterations(self) -> int:
""" int: The number of iterations in the data set. """
return self._iterations
@property
def start_iteration(self) -> int:
""" int: The starting iteration number of a limit has been set on the amount of data. """
return self._start_iteration
@property
def stats(self) -> dict[str, np.ndarray]:
""" dict: The final calculated statistics """
return self._stats
def refresh(self) -> Calculations | None:
""" Refresh the stats """
logger.debug("Refreshing")
if not _SESSION.is_loaded:
logger.warning("Session data is not initialized. Not refreshing")
return None
self._iterations = 0
self._get_raw()
self._get_calculations()
self._remove_raw()
logger.debug("Refreshed: %s", {k: f"Total: {len(v)}, Min: {np.nanmin(v)}, "
f"Max: {np.nanmax(v)}, "
f"nans: {np.count_nonzero(np.isnan(v))}"
for k, v in self.stats.items()})
return self
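# Illustrative (hypothetical) construction of this class, roughly as the live graph might use
# it; the loss key names below are placeholders:
#
#   calcs = Calculations(session_id=1,
#                        display="loss",
#                        loss_keys=["loss_a", "loss_b"],
#                        selections=["raw", "smoothed"],
#                        smooth_amount=0.9)
#   data = calcs.stats   # e.g. {"raw_loss_a": ..., "smoothed_loss_a": ..., ...}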
def set_smooth_amount(self, amount: float) -> None:
""" Set the amount of smoothing to apply to smoothed graph.
Parameters
----------
amount: float
The amount of smoothing to apply to the smoothed graph
"""
update = max(min(amount, 0.999), 0.001)
logger.debug("Setting smooth amount to: %s (provided value: %s)", update, amount)
self._args["smooth_amount"] = update
def update_selections(self, selection: str, option: bool) -> None:
""" Update the type of selected data.
Parameters
----------
selection: str
The selection to update (as can exist in :attr:`_selections`)
option: bool
``True`` if the selection should be included, ``False`` if it should be removed
"""
# TODO Somewhat hacky, to ensure values are inserted in the correct order. Fine for
# now as this is only called from Live Graph and selections can only be "raw" and
# smoothed.
if option:
if selection not in self._selections:
if selection == "raw":
self._selections.insert(0, selection)
else:
self._selections.append(selection)
else:
if selection in self._selections:
self._selections.remove(selection)
def set_iterations_limit(self, limit: int) -> None:
""" Set the number of iterations to display in the calculations.
If a value greater than 0 is passed, then the latest iterations up to the given
limit will be calculated.
Parameters
----------
limit: int
The number of iterations to calculate data for. `0` to calculate for all data
"""
limit = max(0, limit)
logger.debug("Setting iteration limit to: %s", limit)
self._limit = limit
def _get_raw(self) -> None:
""" Obtain the raw loss values and add them to a new :attr:`stats` dictionary. """
logger.debug("Getting Raw Data")
self.stats.clear()
iterations = set()
if self._display.lower() == "loss":
loss_dict = _SESSION.get_loss(self._session_id)
for loss_name, loss in loss_dict.items():
if loss_name not in self._loss_keys:
continue
iterations.add(loss.shape[0])
if self._limit > 0:
loss = loss[-self._limit:]
if self._args["flatten_outliers"]:
loss = self._flatten_outliers(loss)
self.stats[f"raw_{loss_name}"] = loss
self._iterations = 0 if not iterations else min(iterations)
if self._limit > 1:
self._start_iteration = max(0, self._iterations - self._limit)
self._iterations = min(self._iterations, self._limit)
else:
self._start_iteration = 0
if len(iterations) > 1:
# Crop all losses to the same number of items
if self._iterations == 0:
self._stats = {lossname: np.array([], dtype=loss.dtype)
for lossname, loss in self.stats.items()}
else:
self._stats = {lossname: loss[:self._iterations]
for lossname, loss in self.stats.items()}
else: # Rate calculation
data = self._calc_rate_total() if self._is_totals else self._calc_rate()
if self._args["flatten_outliers"]:
data = self._flatten_outliers(data)
self._iterations = data.shape[0]
self.stats["raw_rate"] = data
logger.debug("Got Raw Data: %s", {k: f"Total: {len(v)}, Min: {np.nanmin(v)}, "
f"Max: {np.nanmax(v)}, "
f"nans: {np.count_nonzero(np.isnan(v))}"
for k, v in self.stats.items()})
@classmethod
def _flatten_outliers(cls, data: np.ndarray) -> np.ndarray:
""" Remove the outliers from a provided list.
Removes data more than 1 Standard Deviation from the mean.
Parameters
----------
data: :class:`numpy.ndarray`
The data to remove the outliers from
Returns
-------
:class:`numpy.ndarray`
The data with outliers replaced by the mean
"""
logger.debug("Flattening outliers: %s", data.shape)
mean = np.mean(np.nan_to_num(data))
limit = np.std(np.nan_to_num(data))
logger.debug("mean: %s, limit: %s", mean, limit)
retdata = np.where(abs(data - mean) < limit, data, mean)
logger.debug("Flattened outliers")
return retdata
def _remove_raw(self) -> None:
""" Remove raw values from :attr:`stats` if they are not requested. """
if "raw" in self._selections:
return
logger.debug("Removing Raw Data from output")
for key in list(self._stats.keys()):
if key.startswith("raw"):
del self._stats[key]
logger.debug("Removed Raw Data from output")
def _calc_rate(self) -> np.ndarray:
""" Calculate rate per iteration.
Returns
-------
:class:`numpy.ndarray`
The training rate for each iteration of the selected session
"""
logger.debug("Calculating rate")
batch_size = _SESSION.batch_sizes[self._session_id] * 2
retval = batch_size / np.diff(T.cast(np.ndarray,
_SESSION.get_timestamps(self._session_id)))
logger.debug("Calculated rate: Item_count: %s", len(retval))
return retval
@classmethod
def _calc_rate_total(cls) -> np.ndarray:
""" Calculate rate per iteration for all sessions.
Returns
-------
:class:`numpy.ndarray`
The training rate for each iteration in all sessions
Notes
-----
For totals, gaps between sessions can be large so the time difference has to be reset for
each session's rate calculation.
"""
logger.debug("Calculating totals rate")
batchsizes = _SESSION.batch_sizes
total_timestamps = _SESSION.get_timestamps(None)
rate: list[float] = []
for sess_id in sorted(total_timestamps.keys()):
batchsize = batchsizes[sess_id]
timestamps = total_timestamps[sess_id]
rate.extend((batchsize * 2) / np.diff(timestamps))
retval = np.array(rate)
logger.debug("Calculated totals rate: Item_count: %s", len(retval))
return retval
def _get_calculations(self) -> None:
""" Perform the required calculations and populate :attr:`stats`. """
for selection in self._selections:
if selection == "raw":
continue
logger.debug("Calculating: %s", selection)
method = getattr(self, f"_calc_{selection}")
raw_keys = [key for key in self._stats if key.startswith("raw_")]
for key in raw_keys:
selected_key = f"{selection}_{key.replace('raw_', '')}"
self._stats[selected_key] = method(self._stats[key])
logger.debug("Got calculations: %s", {k: f"Total: {len(v)}, Min: {np.nanmin(v)}, "
f"Max: {np.nanmax(v)}, "
f"nans: {np.count_nonzero(np.isnan(v))}"
for k, v in self.stats.items()
if not k.startswith("raw")})
def _calc_avg(self, data: np.ndarray) -> np.ndarray:
""" Calculate moving average.
Parameters
----------
data: :class:`numpy.ndarray`
The data to calculate the moving average for
Returns
-------
:class:`numpy.ndarray`
The moving average for the given data
"""
logger.debug("Calculating Average. Data points: %s", len(data))
window = T.cast(int, self._args["avg_samples"])
pad = ceil(window / 2)
datapoints = data.shape[0]
if datapoints <= (self._args["avg_samples"] * 2):
logger.info("Not enough data to compile rolling average")
return np.array([], dtype="float64")
avgs = np.cumsum(np.nan_to_num(data), dtype="float64")
avgs[window:] = avgs[window:] - avgs[:-window]
avgs = avgs[window - 1:] / window
avgs = np.pad(avgs, (pad, datapoints - (avgs.shape[0] + pad)), constant_values=(np.nan,))
logger.debug("Calculated Average: shape: %s", avgs.shape)
return avgs
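# The cumulative-sum identity used above, shown on a toy example (illustration only): for
# data [1, 2, 3, 4] and a window of 2, cumsum gives [1, 3, 6, 10]; subtracting the cumsum
# shifted by the window yields [1, 3, 5, 7], and dividing the last three entries by the
# window gives the rolling means [1.5, 2.5, 3.5]. The result is then NaN-padded back to the
# original length so it lines up with the raw data when plotted.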
def _calc_smoothed(self, data: np.ndarray) -> np.ndarray:
""" Smooth the data.
Parameters
----------
data: :class:`numpy.ndarray`
The data to smooth
Returns
-------
:class:`numpy.ndarray`
The smoothed data
"""
retval = _ExponentialMovingAverage(data, self._args["smooth_amount"])()
logger.debug("Calculated Smoothed data: shape: %s", retval.shape)
return retval
@classmethod
def _calc_trend(cls, data: np.ndarray) -> np.ndarray:
""" Calculate polynomial trend of the given data.
Parameters
----------
data: :class:`numpy.ndarray`
The data to calculate the trend for
Returns
-------
:class:`numpy.ndarray`
The trend for the given data
"""
logger.debug("Calculating Trend")
points = data.shape[0]
if points < 10:
dummy = np.empty((points, ), dtype=data.dtype)
dummy[:] = np.nan
return dummy
x_range = range(points)
trend = np.poly1d(np.polyfit(x_range, np.nan_to_num(data), 3))(x_range)
logger.debug("Calculated Trend: shape: %s", trend.shape)
return trend
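# Note on the trend calculation above: a degree-3 polynomial is least-squares fitted to the
# NaN-cleaned data with np.polyfit, then evaluated back over the same x range via np.poly1d,
# giving a smooth trend line of the same length as the input. Fewer than 10 points returns an
# all-NaN array so the graph simply shows no trend.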
class _ExponentialMovingAverage(): # pylint:disable=too-few-public-methods
""" Reshapes data before calculating exponential moving average, then iterates once over the
rows to calculate the offset without precision issues.
Parameters
----------
data: :class:`numpy.ndarray`
A 1 dimensional numpy array to obtain smoothed data for
amount: float
The smoothing amount, in the range (0.0, 1.0); the moving average alpha is derived as
``1.0 - amount``.
Notes
-----
Adapted from: https://stackoverflow.com/questions/42869495
"""
def __init__(self, data: np.ndarray, amount: float) -> None:
logger.debug(parse_class_init(locals()))
assert data.ndim == 1
amount = min(max(amount, 0.001), 0.999)
self._data = np.nan_to_num(data)
self._alpha = 1. - amount
self._dtype = "float32" if data.dtype == np.float32 else "float64"
self._row_size = self._get_max_row_size()
self._out = np.empty_like(data, dtype=self._dtype)
logger.debug("Initialized %s", self.__class__.__name__)
def __call__(self) -> np.ndarray:
""" Perform the exponential moving average calculation.
Returns
-------
:class:`numpy.ndarray`
The smoothed data
"""
if self._data.size <= self._row_size:
self._ewma_vectorized(self._data, self._out) # Normal function can handle this input
else:
self._ewma_vectorized_safe() # Use the safe version
return self._out
def _get_max_row_size(self) -> int:
""" Calculate the maximum row size for the running platform for the given dtype.
Returns
-------
int
The maximum row size possible on the running platform for the given :attr:`_dtype`
Notes
-----
Might not be the optimal value for speed, which is hard to predict due to numpy
optimizations.
"""
# Use :func:`np.finfo(dtype).eps` if you are worried about accuracy and want to be safe.
epsilon = np.finfo(self._dtype).tiny # pylint:disable=no-member
# If this produces an OverflowError, make epsilon larger:
retval = int(np.log(epsilon) / np.log(1 - self._alpha)) + 1
logger.debug("row_size: %s", retval)
return retval
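# Rough illustration of the row size derivation (numbers are approximate): the row length is
# the largest n for which (1 - alpha) ** n stays above the dtype's smallest positive normal
# value, i.e. n ~ log(tiny) / log(1 - alpha). For float64 (tiny ~ 2.2e-308) and a smoothing
# amount of 0.9 (so 1 - alpha = 0.9) that works out at roughly 6,700 elements per row.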
def _ewma_vectorized_safe(self) -> None:
""" Perform the vectorized exponential moving average in a safe way. """
num_rows = int(self._data.size // self._row_size) # the number of rows to use
leftover = int(self._data.size % self._row_size) # the amount of data leftover
first_offset = self._data[0]
if leftover > 0:
# set temporary results to slice view of out parameter
out_main_view = np.reshape(self._out[:-leftover], (num_rows, self._row_size))
data_main_view = np.reshape(self._data[:-leftover], (num_rows, self._row_size))
else:
out_main_view = self._out.reshape(-1, self._row_size)
data_main_view = self._data.reshape(-1, self._row_size)
self._ewma_vectorized_2d(data_main_view, out_main_view) # get the scaled cumulative sums
scaling_factors = (1 - self._alpha) ** np.arange(1, self._row_size + 1)
last_scaling_factor = scaling_factors[-1]
# create offset array
offsets = np.empty(out_main_view.shape[0], dtype=self._dtype)
offsets[0] = first_offset
# iteratively calculate offset for each row
for i in range(1, out_main_view.shape[0]):
offsets[i] = offsets[i - 1] * last_scaling_factor + out_main_view[i - 1, -1]
# add the offsets to the result
out_main_view += offsets[:, np.newaxis] * scaling_factors[np.newaxis, :]
if leftover > 0:
# process trailing data in the 2nd slice of the out parameter
self._ewma_vectorized(self._data[-leftover:],
self._out[-leftover:],
offset=out_main_view[-1, -1])
def _ewma_vectorized(self,
data: np.ndarray,
out: np.ndarray,
offset: float | None = None) -> None:
""" Calculates the exponential moving average over a vector. Will fail for large inputs.
The result is processed in place into the array passed to the `out` parameter
Parameters
----------
data: :class:`numpy.ndarray`
A 1 dimensional numpy array to obtain smoothed data for
out: :class:`numpy.ndarray`
A location into which the result is stored. It must have the same shape and dtype as
the input data
offset: float, optional
The offset for the moving average, scalar. Default: the value held in data[0].
"""
if data.size < 1: # empty input, return empty array
return
offset = data[0] if offset is None else offset
# scaling_factors -> 0 as len(data) gets large. This leads to divide-by-zeros below
scaling_factors = np.power(1. - self._alpha, np.arange(data.size + 1, dtype=self._dtype),
dtype=self._dtype)
# create cumulative sum array
np.multiply(data, (self._alpha * scaling_factors[-2]) / scaling_factors[:-1],
dtype=self._dtype, out=out)
np.cumsum(out, dtype=self._dtype, out=out)
out /= scaling_factors[-2::-1] # cumulative sums / scaling
if offset != 0:
noffset = np.array(offset, copy=False).astype(self._dtype, copy=False)
out += noffset * scaling_factors[1:]
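# Explanatory note on the vectorized form above: with alpha = self._alpha, the recurrence
# y[i] = (1 - alpha) * y[i - 1] + alpha * x[i] unrolls to
# y[i] = sum_j(alpha * (1 - alpha) ** (i - j) * x[j]) + (1 - alpha) ** (i + 1) * offset,
# which is what the scale / cumsum / unscale steps compute without an explicit Python loop.
# The scaling factors shrink towards zero as the input grows, hence the row size cap above.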
def _ewma_vectorized_2d(self, data: np.ndarray, out: np.ndarray) -> None:
""" Calculates the exponential moving average over the last axis.
The result is processed in place into the array passed to the `out` parameter
Parameters
----------
data: :class:`numpy.ndarray`
A 1 or 2 dimensional numpy array to obtain smoothed data for.
out: :class:`numpy.ndarray`
A location into which the result is stored. It must have the same shape and dtype as
the input data
"""
if data.size < 1: # empty input, return empty array
return
# calculate the moving average
scaling_factors = np.power(1. - self._alpha, np.arange(data.shape[1] + 1,
dtype=self._dtype),
dtype=self._dtype)
# create a scaled cumulative sum array
np.multiply(data,
np.multiply(self._alpha * scaling_factors[-2],
np.ones((data.shape[0], 1), dtype=self._dtype),
dtype=self._dtype) / scaling_factors[np.newaxis, :-1],
dtype=self._dtype, out=out)
np.cumsum(out, axis=1, dtype=self._dtype, out=out)
out /= scaling_factors[np.newaxis, -2::-1]
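# Minimal illustrative sketch of how this helper is driven (the array values are made up):
#
#   raw = np.array([0.31, 0.30, 0.32, 0.29], dtype="float32")
#   smoothed = _ExponentialMovingAverage(raw, amount=0.9)()
#
# which is essentially what Calculations._calc_smoothed does for each loss series.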