From 6abc5d2075c9f31f80c54fc5b2f6ce4164414747 Mon Sep 17 00:00:00 2001 From: Aleksandra Apolinarska <aapolina@ethz.ch> Date: Wed, 5 Mar 2025 15:47:45 +0100 Subject: [PATCH 1/2] #192 first draft: parallel coordinates plot --- src/aixd/visualisation/plotter.py | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/aixd/visualisation/plotter.py b/src/aixd/visualisation/plotter.py index 89c30535..c338d791 100644 --- a/src/aixd/visualisation/plotter.py +++ b/src/aixd/visualisation/plotter.py @@ -1336,6 +1336,47 @@ class Plotter: return self._output(fig, output_name=output_name or f"PerformanceSummary_{block.name}") + def plot_parallel_coordinates( + self, + data: pd.DataFrame, + value_ranges: dict[str, Tuple[float, float]], + color_by: Optional[str] = None, + output_name: Optional[str] = None, + ): + """ + Parallel coordinates plot for the given data. + + Parameters + ---------- + data: [:class:`pd.DataFrame`] + Dataframe containing data to plot. The order of the parallel coordinates corresponds to the order of columns. + value_ranges: dict[str, Tuple[float, float]] + Dictionary with column names as keys and [min,max] as values. + color_by: str, optional + Name of the variable to color the lines by. If not specified, default color (grey) is used for all samples. + output_name : str, optional, default=None + Name of the output file. If None, the name is automatically generated from the data block name. + + Returns + ------- + Optional[:class:`plotly.graph_objects.Figure`] + Plotly figure object, if self.output is None, otherwise None. + """ + # TODO: accommodate categorical variables + # TODO: add transformed flag + # TODO: add flag to make value_ranges optional, and derive defaults from dataset/datamodule, or from given data + # TODO: separate function for dataset and separate for generated samples? + # TODO: add "variables" argument so that user can supply a whole df and specify which columns to plot? + + if color_by is not None: + cmin, cmax = value_ranges[color_by] + linesettings = dict(color=data[color_by], colorscale=px.colors.sequential.Turbo, cmin=cmin, cmax=cmax) + else: + linesettings = dict(color="grey") + + fig = go.Figure(data=go.Parcoords(line=linesettings, dimensions=list([dict(range=value_ranges[col], label=col, values=data[col].tolist()) for col in data.columns]))) + return self._output(fig, output_name=output_name or "ParallelCoordinates") + @staticmethod def _open_fig(size: Tuple[int, int] = (1, 1), **kwargs) -> go.Figure: """Helper method to open a figure with the desired number of rows and columns.""" -- GitLab From 09377bdf534914e2e4e4bc37c083e80075143157 Mon Sep 17 00:00:00 2001 From: Aleksandra Apolinarska <aapolina@ethz.ch> Date: Thu, 6 Mar 2025 16:49:58 +0100 Subject: [PATCH 2/2] #192 added handling of multidim and categorical/bool variables --- src/aixd/data/data_objects.py | 1 + src/aixd/visualisation/plotter.py | 98 +++++++++++++++++++++++++++---- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/src/aixd/data/data_objects.py b/src/aixd/data/data_objects.py index 48083301..d7538793 100644 --- a/src/aixd/data/data_objects.py +++ b/src/aixd/data/data_objects.py @@ -1012,6 +1012,7 @@ class DataBool(DataCategorical): def __init__(self, name: str, **kwargs): domain = kwargs.pop("domain", dd.Options(["True", "False"], type="categorical")) super().__init__(name=name, domain=domain, **kwargs) + self.type = "bool" class DataOrdinal(DataDiscrete): diff --git a/src/aixd/visualisation/plotter.py b/src/aixd/visualisation/plotter.py index c338d791..420480c6 100644 --- a/src/aixd/visualisation/plotter.py +++ b/src/aixd/visualisation/plotter.py @@ -1336,10 +1336,11 @@ class Plotter: return self._output(fig, output_name=output_name or f"PerformanceSummary_{block.name}") - def plot_parallel_coordinates( + def parallel_coordinates( self, data: pd.DataFrame, - value_ranges: dict[str, Tuple[float, float]], + variables: Optional[List[str]] = None, + value_ranges: Optional[dict[str, Tuple[float, float]]] = None, color_by: Optional[str] = None, output_name: Optional[str] = None, ): @@ -1349,9 +1350,14 @@ class Plotter: Parameters ---------- data: [:class:`pd.DataFrame`] - Dataframe containing data to plot. The order of the parallel coordinates corresponds to the order of columns. - value_ranges: dict[str, Tuple[float, float]] - Dictionary with column names as keys and [min,max] as values. + Dataframe containing data to plot. + This could be data from the dataset or new samples generated by the model. + The order of the parallel coordinates corresponds to the order of columns. + variables: List[str], optional + List of variable names to plot (a selection from the data given). If not specified, all columns are plotted. + value_ranges: dict[str, Tuple[float, float]], optional + Dictionary with column names (or variable names, if `variables` are given) as keys and [min,max] as values. + If not specified, the ranges are derived from the dataset (if `variables` are given), or from the provided data (if otherwise). color_by: str, optional Name of the variable to color the lines by. If not specified, default color (grey) is used for all samples. output_name : str, optional, default=None @@ -1362,19 +1368,89 @@ class Plotter: Optional[:class:`plotly.graph_objects.Figure`] Plotly figure object, if self.output is None, otherwise None. """ - # TODO: accommodate categorical variables # TODO: add transformed flag - # TODO: add flag to make value_ranges optional, and derive defaults from dataset/datamodule, or from given data # TODO: separate function for dataset and separate for generated samples? - # TODO: add "variables" argument so that user can supply a whole df and specify which columns to plot? + + # if data is real or int --> calc value ranges + # if data is bool or cat --> set tickvals and ticktext + # need to create a dummy variable: + # https://stackoverflow.com/questions/64139316/plotly-how-to-insert-a-categorical-variable-into-a-parallel-coordinates-plot + + df = data + + all_parcoorddimensions = [] + linesettings = None + + if variables: + """ + If variable names are given, some settings are derived from the data objects via Dataset. + """ + all_dobjs = self.dataset.get_data_objects_by_name(variables) + all_column_names = list(chain(*[dobj.columns_df for dobj in all_dobjs])) + for dobj in all_dobjs: + if color_by is not None and dobj.name == color_by: + if dobj.dim > 1: + raise ValueError(f"Cannot color by multi-dimensional attribute {dobj.name}.") + + for colname in dobj.columns_df: + if dobj.type in ["real", "integer"]: + if value_ranges is not None: + vrange = value_ranges[colname] + else: + vrange = [dobj.domain.min_value, dobj.domain.max_value] + parcoorddimension = dict(range=vrange, label=colname, values=df[colname].tolist()) + if color_by is not None and dobj.name == color_by: + linesettings = dict(color=df[color_by], colorscale=color_divergent_centered, cmin=vrange[0], cmax=vrange[1]) + elif dobj.type in ["bool", "categorical"]: + options = [str(val) for val in dobj.domain.array] + options.sort() + dummyindex = {opt: idx for idx, opt in enumerate(options)} + parcoorddimension = dict( + tickvals=list(dummyindex.values()), ticktext=list(dummyindex.keys()), label=colname, values=[dummyindex[str(val)] for val in df[colname].tolist()] + ) + else: + raise ValueError(f"DataObject type {dobj.type} not supported for parallel coordinates plot.") + all_parcoorddimensions.append(parcoorddimension) + + else: + """ + If variable names are not given, some settings are derived from the dataframe directly. + """ + all_column_names = df.columns + df = df.infer_objects() + for colname in all_column_names: + if df[colname].dtype in [ + "int64", + "float64", + "float32", + "int32", + ]: + if value_ranges is not None: + vrange = value_ranges[colname] + else: + vrange = [df[colname].min(), df[colname].max()] + parcoorddimension = dict(range=vrange, label=colname, values=df[colname].tolist()) + if color_by is not None and colname == color_by: + linesettings = dict(color=df[color_by], colorscale=color_divergent_centered, cmin=vrange[0], cmax=vrange[1]) + elif df[colname].dtype in ["str", "object", "bool"]: + options = [str(val) for val in df[colname].unique()] + options.sort() + dummyindex = {opt: idx for idx, opt in enumerate(options)} + parcoorddimension = dict( + tickvals=list(dummyindex.values()), ticktext=list(dummyindex.keys()), label=colname, values=[dummyindex[str(val)] for val in df[colname].tolist()] + ) + else: + raise ValueError(f"Data type {df[colname].dtype} not supported for parallel coordinates plot.") + all_parcoorddimensions.append(parcoorddimension) if color_by is not None: - cmin, cmax = value_ranges[color_by] - linesettings = dict(color=data[color_by], colorscale=px.colors.sequential.Turbo, cmin=cmin, cmax=cmax) + if linesettings is None: + # linesettings not set yet --> it is a categorical/bool variable + linesettings = dict(color=df[color_by], colorscale=color_qualitative10) else: linesettings = dict(color="grey") - fig = go.Figure(data=go.Parcoords(line=linesettings, dimensions=list([dict(range=value_ranges[col], label=col, values=data[col].tolist()) for col in data.columns]))) + fig = go.Figure(data=go.Parcoords(line=linesettings, dimensions=all_parcoorddimensions)) return self._output(fig, output_name=output_name or "ParallelCoordinates") @staticmethod -- GitLab