Source code for qpcr.main.Results

"""
This is the ``qpcr.Results`` class whose function is to accumulate results from various
``qpcr.Assay`` objects and summarize them.

Setting up a ``qpcr.Results`` object
====================================

Since the ``Results`` are supposed to be a central collection hub it makes sense to know how to make them.
The setup is fairly simple. The ``qpcr.Results`` already provide a number of methods to directly add specific data
such as Delta-Delta-Ct values to their dataframes from ``qpcr.Assay`` objects. However, they also allow more generic
data manipulation through normal item setting, getting, and deleting.

An important first step is usually to adopt the experimental meta-data shared by the Assays. 
This can be done using the ``setup_cols`` method which copies the ``id, group, and group_name`` columns from an Assay.
Once this is done, we can easily add more interesting data.

.. code-block:: python

    # initialize the Results
    result = Results()

    # make sure the metadata is present
    result.setup_cols( some_assay )

    # now copy actually interesting data
    # for example Delta-Delta-Ct values
    result.add_ddCt( some_assay )

    # now we can continue to assemble data
    # for instance with
    for assay in a_list_of_assays:
        result.add_ddCt( assay )

    # or directly
    # result.add_ddCt( a_list_of_assays )

    # and now summarize these
    result.stats()

    # and visualise
    result.preview()

Alternatively, we might wish to make use of of a ``Results`` object for data processing where we might want to assemble a set of Assays from different files into a single BigTable-like file.
For this we might only wish to store the Ct values and then save them to a new file.

.. code-block:: python

    # a list of many assays
    many_assays = [...]

    r = Results()
    r.setup_cols( many_assays[0] )
    r.add_Ct( many_assays )

    # and now save the accumulated file
    r.save( ... )

"""

import qpcr.defaults as defaults
import qpcr._auxiliary as aux
import qpcr._auxiliary.warnings as aw
import qpcr.main.Assay as Assay

# import qpcr.stats.Comparisons as Comparisons

import re
import pandas as pd
import numpy as np
from scipy.stats import sem, t
import os


logger = aux.default_logger()


[docs]class Results(aux._ID): """ Handles a pandas dataframe for data and computed results from a ``qpcr`` class. Note ----- This is a central data collection that can inherit directly from ``qpcr.Assay`` objects and from externally computed sources. Please, note that it will not perform extensive vetting on its data input, so make sure to only provide proper data input when manually assembling your ``qpcr.Results``! """ __slots__ = ["_df", "_stats_df", "_id", "_rel_cols", "_comparisons"] def __init__(self, id: str = None): super().__init__() if id is not None: self.id(id) self._df = pd.DataFrame() self._stats_df = pd.DataFrame() self._rel_cols = None self._comparisons = None
[docs] def get(self): """ Returns ------- data : pd.DataFrame The Results dataframe """ return self._df
[docs] def add_Ct(self, assay: Assay): """ Adds a `"Ct"` column with Delta-Ct values from an ``qpcr.Assay``. It will store these as a new column using the Assay's ``id`` as header. Parameters ------- assay : qpcr.Assay An ``qpcr.Assay`` object from which to import. """ if isinstance(assay, list): [self.add_Ct(i) for i in assay] return if self.is_empty: self.setup_cols(assay) self.add(assay.Ct)
[docs] def add_dCt(self, assay: Assay): """ Adds a `"dCt"` column with Delta-Ct values from an ``qpcr.Assay``. It will store these as a new column using the Assay's ``id`` as header. Parameters ------- assay : qpcr.Assay An ``qpcr.Assay`` object from which to import. """ if isinstance(assay, list): [self.add_dCt(i) for i in assay] return if self.is_empty: self.setup_cols(assay) self.add(assay.dCt)
[docs] def add_ddCt(self, assay: Assay): """ Adds all `"rel_{}"` columns with Delta-Delta-Ct values from an ``qpcr.Assay``. It will store these as new columns using the Assay's ``id`` + the ``_rel_{}`` composite id. Parameters ------- assay : qpcr.Assay An ``qpcr.Assay`` object from which to import. """ if isinstance(assay, list): [self.add_ddCt(i) for i in assay] return if self.is_empty: self.setup_cols(assay) self.add(assay.ddCt)
[docs] def add(self, data: (pd.Series or pd.DataFrame), replace: bool = False): """ Adds some new datacolumn. Note ---- The ``column`` argument has to be named for this to work. However, there are already implemented methods dedicated to adding specifically Delta-Ct, Delta-Delta-Ct or just Ct values to the Results. In order to add a generic column from a numpy array or some other iterable just use default item setting (e.g. `results["new column"] = [1,2,3,4]`). Parameters ---------- data : pd.Series or pd.DataFrame A named pandas Series or DataFrame that can be joined into the already stored dataframe. Note, a DataFrame may contain multiple columns. replace : bool In case results from a computation with the same identifiers are already stored no new data can be stored under that id. Either the new data must be renamed or ``replace = True`` must be set to overwrite the presently stored data. """ if isinstance(data, pd.Series): if data.name in self._df.columns: if not replace: e = aw.ResultsError("name_overlap", name=data.name) logger.error(e) return self._df[data.name] = data # else: # self._df = self._df.join(data) elif isinstance(data, pd.DataFrame): new = data.columns.unique() current = self._df.columns.unique() to_add = new if not replace: to_add = set(new) ^ set(current) # this line preserves the original order which is lost by the set() to_add = [i for i in new if i in to_add] if len(to_add) != len(new): logger.info(f"Excluding {tuple(new.intersection(current))} due to name overlap. Use replace=True to force replacement.") to_add = list(to_add) self._df[to_add] = data[to_add] return self
[docs] def merge(self, *Results, all_cols: bool = False): """ Merge any number of ``qpcr.Results`` objects into this one. The same can be achieved using the + operator. Note ----- This operation will merge the columns of the Results' dataframes! Parameters ---------- *Results An arbitrary number of ``qpcr.Results`` objects. all_cols : bool Set to ``True`` to merge not only the Delta-Delta-Ct columns (_rel_ columns) but also any additional columns. """ new_df = self._df.copy() for result in Results: df = result.get() if not all_cols: df = df[[i for i in result.columns if i not in defaults.setup_cols]] # we merge the dataframes first without adding # some new id suffix, only do so if this fails try: # check if we have an overlap of column names intersect = set(df.columns).intersection(set(new_df.columns)) if intersect != set(): raise IndexError(f"Duplicate column names were found: {intersect}") new_df = pd.merge( new_df, df, right_index=True, left_index=True, ) except aw.ClassError as e: new_df = pd.merge(new_df, df, right_index=True, left_index=True, suffixes=[f"_{self.id()}", f"_{result.id()}"]) logger.warning(e) except Exception as e: raise e self._df = new_df return self
[docs] def rename(self, cols: dict): """ Renames columns according to a dictionary as key -> value. This is the same as calling ``Results.rename_cols``. Parameters ---------- cols : dict A dictionary specifying old column names (keys) and new colums names (values). """ self.rename_cols(cols) return self
[docs] def rename_cols(self, cols: dict): """ Renames columns according to a dictionary as key -> value. This is the same as calling ``Results.rename``. Parameters ---------- cols : dict A dictionary specifying old column names (keys) and new colums names (values). """ self._df = self._df.rename(columns=cols)
[docs] def drop(self, *cols): """ Drops all specified columns from the dataframe. This is used for normaliser pre-processing. This is the same as calling ``Results.drop_cols``. Parameters ---------- *cols Any column names (as ``str``) to be dropped. """ self.drop_cols(*cols) return self
[docs] def drop_cols(self, *cols): """ Drops all specified columns from the dataframe. This is used for normaliser pre-processing. This is the same as calling ``Results.drop``. Parameters ---------- *cols Any column names (as ``str``) to be dropped. """ for c in cols: del self[c]
[docs] def setup_cols(self, obj: (Assay or pd.DataFrame)): """ Adopts the setup columns: ``id, group, group_name`` from another object. Parameters ------- obj qpcr.Assay or qpcr.Results or pd.DataFrame Either a ``qpcr.Assay`` or a ``qpcr.Results`` or a pandas DataFrame that has the given columns. """ self["id"] = obj["id"] self["group"] = obj["group"] self["group_name"] = obj["group_name"]
[docs] def names(self, as_set=True): """ Parameters ---------- as_set : bool If ``as_set = True`` (default) it returns a set (as list without duplicates) of assigned group names for replicate groups. If ``as_set = False`` it returns the full group_name column (including all repeated entries). Returns ------- names : list or None The adopted ``group_names`` (only works if a ``qpcr.Assay`` has already been linked using ``adopt_names()``!) """ if as_set: names = list(self._df["group_name"].unique()) return names
[docs] def groups(self, as_set=True): """ Parameters ---------- as_set : bool If ``as_set = True`` (default) it returns a set (as list without duplicates) of assigned group names for replicate groups. If ``as_set = False`` it returns the full group column (including all repeated entries). Returns ------- groups : list The given numeric group identifiers of all replicate groups. """ groups = list(self._df["group"].unique()) if as_set else self._df["group"] return groups
[docs] def drop_groups(self, groups: (list or str or int)): """ Removes specific groups of replicates from the DataFrame. Parameters ---------- groups : list Either the numeric group identifiers or the group name, or an iterable thereof, of the groups to be removed, or a ``regex`` pattern defining which groups should be dropped (this is useful for systematically removing RT- groups etc.) A ``regex pattern`` can be supplied as well to match multiple group names. """ # check for regex pattern # and get corresponding group names if isinstance(groups, str): groups = [i for i in self._df["group_name"] if re.match(groups, i) is not None] elif isinstance(groups, int): groups = [groups] # get the right reference column and query to use to be # used (either group or group_name) ref_query = "group != {group}" if isinstance(groups[0], int) else "group_name != '{group}'" # remove groups from dataset for group in groups: self._df = self._df.query(ref_query.format(group=group)) # also drop from stats df if not len(self._stats_df) == 0: self._stats_df = self._stats_df.query(ref_query.format(group=group))
[docs] def drop_rel(self): """ Crops the ``X_rel_Y`` column-names of Delta-Delta-Ct results to just ``X``. I.e. reduces back to the assay-of-interest name only. """ # first store the current _rel_ cols for ddCt_col to_change = {i: i.split("_rel_")[0] for i in self._df.columns if "_rel_" in i} self._rel_cols = list(to_change.values()) self.rename_cols(to_change) # also recompute the stats df with new names... if not len(self._stats_df) == 0: self.stats(recompute=True) # also adjust the comparisons keys if we have any if aux.pseudo_isinstance(self._comparisons, "ComparisonsCollection"): for i in self._comparisons: if "_rel_" in i.id(): i.id(i.id().split("_rel_")[0]) if "_rel_" in i.labels[0][0]: i.labels = i._set_labels(i.pvalues, [j.split("_rel_")[0] for j in i.labels[0]]) self._comparisons._dict = {i.id(): i for i in self._comparisons.comparisons}
[docs] def stats(self, recompute=False, iqr_limits: tuple = None, ci_level: float = 0.95): """ Computes summary statistis about the replicate groups: - ``N (count)`` - ``Mean`` - ``Median`` - ``StDev`` - ``IQR`` - ``CI`` of all replicate groups, for all datasets (assays). Parameters ---------- recompute : bool Statistics will only be once unless recompute is set to ``True``. The same dataframe can be directly accessed via this method once is has been computed. iqr_limits : tuple The lower and upper quantiles for the IQR computation. By default ``(0.25, 0.75)`` Returns ------- stats_df : pd.DataFrame A new dataframe containing the computed statistics for each replicate group. """ iqr_limits = (0.25, 0.75) if iqr_limits is None else iqr_limits _stats = { "mean": lambda x: np.nanmean(x, axis=0), "stdev": lambda x: np.nanstd(x, axis=0), "median": lambda x: np.nanmedian(x, axis=0), f"IQR_{iqr_limits}": lambda x: np.nanquantile(x, iqr_limits[1], axis=0) - np.nanquantile(x, iqr_limits[0], axis=0), f"CI_{ci_level}": lambda x: [i for i in np.array(t.interval(ci_level, len(x) - 1, loc=np.nanmean(x, axis=0), scale=sem(x, nan_policy="omit"))).transpose()], } # if stats_df is already present, return but sorted according to assays, not groups (nicer for user to inspect) if not len(self._stats_df) == 0 and not recompute: return self._stats_df self._stats_df = pd.DataFrame() for group, name in zip(self.groups(), self.names()): subset = self._df.query(f"group == {group}") _subset = subset.drop(columns=defaults.setup_cols, errors="ignore") logger.debug(_subset) # setup a stats dataframe with the right columns (in the right order) _stat = pd.DataFrame(columns=["group", "group_name", "assay", "n"] + list(_stats.keys())) # compute all statistics for label, func in _stats.items(): s = func(_subset) logger.debug(f"{label}: {s}") _stat[label] = s # fill in with groups and group names and assay identifiers in the right length _stat["group"] = group _stat["group_name"] = name _stat[defaults.dataset_header] = _subset.columns _stat["n"] = len(_subset) # and add to the stats dataframe self._stats_df = pd.concat((self._stats_df, _stat), ignore_index=True) self._stats_df = self._stats_df.sort_values(defaults.dataset_header) return self._stats_df
[docs] def save(self, path, df=True, stats=True): """ Saves a csv file for each specified type of results. Parameters ---------- path : str Path has to be a filepath if only one type of results shall be saved (i.e. either ``df`` or ``stats``), otherwise a path to the directory where both ``df`` and ``stats`` shall be saved. df : bool Save the results dataframe containing all replicate values (the full results). Default is ``df = True``. stats : bool Save the results dataframe containing summary statistics for all replicate groups. Default is ``stats = True``. """ if df and stats and not os.path.isdir(path): e = aw.ResultsError("save_need_dir") logger.error(e) raise e if df: # in case of raw results export we don't need the "assay" column as all # assays are stored as separate columns anyaway, so it doesn't store any useful data _df = self._df if "assay" in _df.columns: _df = self._df.drop(columns=["assay"]) self._save_single(path, _df, "_df") if stats: # compute stats if none have been computed yet... if len(self._stats_df) == 0: self.stats() self._save_single(path, self._stats_df, "_stats")
[docs] def preview(self, kind: str = None, mode: str = None, **kwargs): """ A shortcut to call on a ``qpcr.Plotters.PreviewResults`` wrapper to visualise the results. Parameters ---------- kind : str The kind of Plotter to call. This can be any of the four wrapped Plotters, e.g. `kind = "GroupBars"`. By default this will be `"AssayBars"`. mode : str The plotting mode. May be either "static" (matplotlib) or "interactive" (plotly). Returns ------- fig : plt.figure or plotly.figure The figure generated by ``PreviewResults``. """ import qpcr.Plotters as Plotters if kind is None: kind = defaults.default_preview preview_results = Plotters.PreviewResults(mode=mode, kind=kind) preview_results.params(**kwargs) preview_results.link(self) fig = preview_results.plot() return fig
[docs] def add_comparisons(self, comp): """ Add a results from a statistical evaluation of the stored `Results` in the form of a `Comparison` object. Parameters ---------- comp Either a `Comparison` or `ComparisonsCollection` object. """ self._comparisons = comp
@property def comparisons(self): """ Returns a `Comparison` object storing the results of statistical analysis that were performed (if any). """ return self._comparisons @property def columns(self): return self._df.columns @property def ddCt_cols(self): """ Returns ------- cols A list of all {}_rel_{} columns within the Results's dataframe. Or their new names if `drop_rel` was performed. """ if self._rel_cols is not None: return self._rel_cols return [i for i in self._df.columns if "_rel_" in i] @property def data_cols(self): """ Returns ------- cols A list of all non-setup columns in the dataframe. """ return [i for i in self._df.columns if not i in defaults.setup_cols] @property def is_empty(self): """ Checks if any results have been stored so far. Returns ------- bool ``True`` if NO data is yet stored, else ``False``. """ return len(self) == 0 def _save_single(self, path, src, suffix=""): """ Saves either self._df or self._stats_df to a csv file based on a path (path can be either filename or directory) """ filename = path if not os.path.isdir(path) else os.path.join(path, f"rel_{self.id()}{suffix}.csv") src.to_csv(filename, index=False) # def _has_pairwise_comparisons(self): # """ # Checks if the `Results` object has any pairwise comparisons (returns True if so). # """ # if self._comparisons is not None: # if isinstance( self._comparisons, Comparisons.ComparisonsCollection ): # if isinstance( self._comparisons[0], Comparisons.PairwiseComparison ): # return True # elif isinstance( self._comparisons, Comparisons.PairwiseComparison ): # return True # return False def __qplot__(self, **kwargs): return self.preview def __setitem__(self, key, value): self._df[key] = value def __getitem__(self, key): if isinstance(key, (list, tuple)) or key in self._df.columns: return self._df[key] if key in self._stats_df.columns: return self._stats_df[key] def __delitem__(self, key): if key in self._df.columns: del self._df[key] if key in self._stats_df.columns: del self._stats_df[key] def __add__(self, other): self.merge(other) return self def __str__(self): _length = len(str(self._df).split("\n")[0]) s = f""" {"-" * _length} {self._df} {"-" * _length} """.strip() if self.id_was_set(): s = f"{'-' * _length}\n{self.__class__.__name__}: {self._id}\n{s}" return s def __repr__(self): id = self._id data = self.ddCt_cols return f"{self.__class__.__name__}({id=}, {data=})" def __len__(self): return len(self._df) def __iter__(self): return (self._df[defaults.setup_cols + [i]] for i in self.data_cols)