# PEMtk, converters for fit results > Pandas
#
# 15/06/21 v1
import pandas as pd
import numpy as np
from epsproc import multiDimXrToPD
# TODO: should be able to simplify & automate with results.__dict__ - May not be comprehensive?
# NOW REINDEX BY FIT # & Type, this makes sense for wide <> long conversions
# For stacked case, set as tuple dictionary index
def pdConv(self, fitVars = ['success', 'chisqr', 'redchi'], paramVars = ['value', 'stderr', 'vary', 'expr'],
            dataRange = None, batches = None):
    """
    Basic conversion for set of fit results > Pandas, long format.

    Extract fit and parameter results from lmFit objects and stack to PD dataframe.

    Parameters
    ----------
    fitVars : optional, list, default = ['success', 'chisqr', 'redchi']
        Values to extract from lmfit result object (per fit).
        (List defaults are read-only here, so the shared-mutable-default pitfall does not apply.)

    paramVars : optional, list, default = ['value', 'stderr', 'vary', 'expr']
        Values to extract from lmfit params object (per parameter per fit).

    dataRange : optional, list, default = None
        Range of indexes to use, defaults to [0, self.fitInd] (via self._setDefaultFits).

    batches : optional, int, default = None
        Additional batch of labelling for fits.
        - If int, label as ceil(fit #)/batches. E.g. batches = 100 will label fits per 100.
        - If list, use as labels per fit. (NOT YET IMPLEMENTED)

    Returns
    -------
    (dfLong, dfRef) : tuple
        Long-format pd.DataFrame of fit results, plus reference parameter table
        (None if self.params is missing).

    Todo

    - Additional batching options, inc. by file for multiple read case.

    13/07/22: Added type checking and casting, this seems to be an issue now/sometimes (PD version?) - currently defaulting all types to 'object' in testing, although was working previously!
    """

    # Set default indexes
    dataRange = self._setDefaultFits(dataRange)

    # Set vars
    dataDict = {}
    dtypes = {}  # Per-column dtypes; set once from the first parameter seen.
                 # (Previously unbound if every fit raised KeyError.)
    outputIndex = ['Fit','Type','pn'] # Cols in output PD dataframe

    # Extract relevant data from lmfit params class objects & reindex
    # NOW REINDEX BY FIT # & Type, this makes sense for wide <> long conversions
    # For stacked case, set as tuple dictionary index
    for fitInd in range(dataRange[0], dataRange[1]):
        try:
            # Get per-fit vars
            fitDict = {k:getattr(self.data[fitInd]['results'], k) for k in fitVars}

            for n,i in enumerate(self.data[fitInd]['results'].params.items()):
                # 13/07/22: Get types for first case, may be needed later!
                if not dtypes:
                    dtypes = {j:type(getattr(i[1],j)) for j in paramVars}

                # Get data. i[0] is the param name; first char gives type, name[2:] the label.
                pmType = i[0][0]
                dataDict[(fitInd, pmType, n)] = {j:getattr(i[1],j) for j in paramVars}
                dataDict[(fitInd, pmType, n)]['Param'] = i[0][2:] # Use name + type for easier plotting later?
                dataDict[(fitInd, pmType, n)].update(fitDict) # Add per fit items

        except KeyError as e:
            # BUGFIX: previous message referenced loop var `n`, which is unbound
            # (NameError) when the KeyError fires before the parameter loop starts.
            # Report the fit index and the missing key instead.
            if self.verbose['sub']:
                print(f"*** Missing fit key {e} for fit {fitInd}")

    # Stack to long-format PD
    dfLong = pd.DataFrame(dataDict).T

    # 13/07/22: cast data types if required (columns only).
    # Sometimes these are all cast to generic 'object' types (not sure why, PD version maybe?), which causes issues later.
    for k,v in dtypes.items():
        if dfLong[k].dtype != v:
            dfLong[k] = dfLong[k].astype(v)

    # Set index names
    dfLong.index.names = outputIndex

    # Set dType attrib for later.
    dfLong.attrs['dType'] = 'Params Long'

    # Set batches if specified (note: batches=0 is falsy and deliberately skipped —
    # it would otherwise divide by zero below).
    if batches:
        if isinstance(batches,int):
            dfLong['batch'] = np.ceil((dfLong.index.get_level_values('Fit')+1)/(batches)).astype(int) # Quick category for batch of fits

        # TODO: implement some other options here!
        else:
            print(f'** Batch type {type(batches)} not yet supported, skipping batching in pdConv() routine.')

    # Set ref values too, if present
    if hasattr(self,'params'):
        dfRef = self.pdConvRef(paramVars, outputIndex) # Functionalised version
    else:
        dfRef = None
        print("Pandas reference table not set, missing self.params data.")

    return(dfLong, dfRef)
def pdConvRef(self, paramVars = ['value'], outputIndex = ['Fit','Type','pn']):
    """
    Convert reference params set to reference PD table.

    Basic routine stripped from main pdConv() method for reuse elsewhere.

    Parameters
    ----------
    paramVars : optional, list, default = ['value']
        Values to extract from lmfit params object (per parameter).

    outputIndex : optional, list, default = ['Fit','Type','pn']
        Index names for the output dataframe.

    Returns
    -------
    dfRef : pd.DataFrame
        Reference parameter table, indexed by ('ref', param type, param #).

    TODO: add flexibility here.

    13/07/22: Added type checking and casting, this seems to be an issue now/sometimes (PD version?) - currently defaulting all types to 'object' in testing, although was working previously!
    """

    # If params are missing, try to set them
    if not hasattr(self,'params'):
        # Check if set in subset (only in code as of 29/11/21)
        try:
            self.params = self.data[self.subKey]['params']
            self.lmmu = self.data[self.subKey]['lmmu']
        # BUGFIX: was a bare `except:` — now catch only the expected missing-data
        # errors (missing attr or missing dict key) rather than swallowing everything.
        except (AttributeError, KeyError):
            print(f"self.params not set, setting ref values from self.data[{self.subKey}]['matE'] without constraints.")
            self.setMatEFit(paramsCons = {})

    refDataDict = {}
    dtypes = {}  # Per-column dtypes; set once from the first parameter seen.
                 # (Previously unbound if self.params was empty.)

    for n,i in enumerate(self.params.items()):
        # 13/07/22: Get types for first case, may be needed later!
        if not dtypes:
            dtypes = {j:type(getattr(i[1],j)) for j in paramVars}

        # i[0] is the param name; first char gives type, name[2:] the label.
        pmType = i[0][0]
        refDataDict[('ref', pmType, n)] = {j:getattr(i[1],j) for j in paramVars}
        refDataDict[('ref', pmType, n)]['Param'] = i[0][2:] # Use name + type for easier plotting later?

    # Stack to long-format PD
    dfRef = pd.DataFrame(refDataDict).T
    dfRef.index.names = outputIndex
    dfRef.attrs['dType'] = 'Params Ref'

    # 13/07/22: cast data types if required (columns only).
    # Sometimes these are all cast to generic 'object' types (not sure why, PD version maybe?), which causes issues later.
    for k,v in dtypes.items():
        if dfRef[k].dtype != v:
            dfRef[k] = dfRef[k].astype(v)

    return dfRef
def pdConvSetFit(self, matE, colDim = 'it'):
    """
    Restack matE to pd.DataFrame and force to 1D.

    Utility function for setting up fit parameter sets.

    Parameters
    ----------
    matE : Xarray
        Input array to convert (matrix elements).

    colDim : optional, str, default = 'it'
        Dimension to place in columns for the intermediate 2D conversion.

    Returns
    -------
    pd.DataFrame
        1D (fully stacked) dataframe.
    """
    # Convert via the ePSproc routine; keep singleton dims (squeeze = False)
    # to avoid issues with missing dims downstream.
    dfMatE, _ = multiDimXrToPD(matE, colDims = colDim, dropna = True, squeeze = False)

    # Restack the column dim back into the row index to force a 1D frame,
    # then wrap as DataFrame (stack returns a Series here).
    return pd.DataFrame(dfMatE.stack(colDim))