Module panama.analytics.formula_data_frame

Classes

class FormulaDataFrame (sdf: pyspark.sql.dataframe.DataFrame, formula_col: str, key_cols: Union[str, List[str], None] = None, date_cols: Union[str, List[str], None] = None)
Expand source code
class FormulaDataFrame(PanamaDataFrame):
    """PanamaDataFrame whose rows carry a formula written as a string expression.

    The name of the column holding the formula is registered on the instance
    through ``add_columns`` under the key ``"formula_col"``. ``evaluate`` turns
    each row's formula string into a computed value.
    """

    def __init__(
        self,
        sdf: DataFrame,
        formula_col: str,
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Build the formula dataframe and register its formula column.

        Args:
           sdf (DataFrame): spark dataframe containing the formula column
           formula_col (str): name of the column holding the formula expressions
           key_cols (Union[str, List[str], None], optional): forwarded to PanamaDataFrame
           date_cols (Union[str, List[str], None], optional): forwarded to PanamaDataFrame
        """
        super().__init__(sdf, key_cols=key_cols, date_cols=date_cols)
        self.add_columns(key="formula_col", value=formula_col)

    def evaluate(self, *args: PanamaDataFrame, output_col: str = "unitary_amount") -> PanamaDataFrame:
        """Add a column to a spark dataframe computing the value of the formula column of self.
           The formula can be any valid expression on the columns of the variables.
           For performance reasons, use LinearFormulaDataFrame if the formula is linear.
           In general, be careful when using this function if the formula column contains
           many different formulas: one conditional expression is built per distinct formula.

        Args:
           self (FormulaDataFrame): Formula dataframe containing the formula expressions
           args (PanamaDataFrame): PanamaDataFrame containing the variables values
           output_col (str, optional): name of the output_col containing the formula value

        Returns:
           formula_psdf: the result of joining the variables onto self, with the formula
              value column added and the variables' extra columns dropped
        """
        # NOTE(review): the column is registered under the key "formula_col" in __init__
        # but looked up here as "formula" -- confirm get_columns_by_name resolves both.
        formula_col = self.get_columns_by_name("formula")[0]
        # Distinct non-null formulas found in self; each one becomes a branch below.
        formula_list = get_col_as_list(sdf=self.sdf, colname=formula_col, distinct=True, drop_null=True)

        # Join every variables dataframe onto the formulas, keeping all formula rows,
        # and remember which extra columns the variables brought in.
        extra_cols = []
        formula_psdf = self
        for var in args:
            formula_psdf = var.automatic_join(formula_psdf, how="right")
            extra_cols = list_value_append(extra_cols, var.get_extra_cols())

        # One branch per distinct formula: evaluate the expression only on the rows
        # whose formula column matches it, null elsewhere.
        formula_expression = [
            F.when(F.col(formula_col) == formula, F.expr(formula)).otherwise(F.lit(None)) for formula in formula_list
        ]

        if formula_expression:
            formula_value = F.coalesce(*formula_expression)
        else:
            # coalesce() needs at least one argument; with no non-null formula there is
            # nothing to evaluate, so the output is null for every row.
            # Assumes formula values are numeric -- TODO confirm the expected type.
            formula_value = F.lit(None).cast("double")

        formula_psdf = formula_psdf.add_time_series(col_to_add=output_col, serie_type=None, udm=None, col=formula_value)
        formula_psdf = formula_psdf.drop(*extra_cols)

        return formula_psdf

Ancestors

Subclasses

Methods

def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame

Add a column to a Spark dataframe containing the value of the formula stored in the formula column of self. The formula can be any valid expression on the columns of the variables. For performance reasons, use LinearFormulaDataFrame if the formula is linear. In general, be careful when using this function if the formula column contains many different formulas.

Args

self : FormulaDataFrame
Formula dataframe containing the formula expressions
args : PanamaDataFrame
PanamaDataFrame containing the variables values
output_col : str, optional
name of the output_col containing the formula value

Returns

formula_psdf
an object of the same class of variables, with the formula value column computed

Inherited members

class LinearFormulaDataFrame (sdf, coefficients: List[Union[str, int, float]], factor_cols: List[str], key_cols: Union[str, List[str], None] = None, date_cols: Union[str, List[str], None] = None)
Expand source code
class LinearFormulaDataFrame(FormulaDataFrame):
    """FormulaDataFrame whose formula is a linear combination of factors.

    Each entry of ``factor_cols`` is a column of the dataframe; ``coefficients`` are
    paired with the factor columns in order. A coefficient given as a string is read
    as a column name, any other coefficient is used as a literal. One extra trailing
    coefficient, if present, is the constant term.
    """

    def __init__(
        self,
        sdf,
        coefficients: List[Union[str, int, float]],
        factor_cols: List[str],
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Build the linear formula dataframe and write its formula string column.

        Args:
           sdf: spark dataframe containing the factor columns
           coefficients (List[Union[str, int, float]]): one coefficient per factor column,
              optionally followed by a constant term; strings are column names
           factor_cols (List[str]): names of the columns holding the factors
           key_cols (Union[str, List[str], None], optional): forwarded to FormulaDataFrame
           date_cols (Union[str, List[str], None], optional): forwarded to FormulaDataFrame
        """
        # The formula column is always named "formula"; set_formula() creates it below.
        super().__init__(sdf, "formula", key_cols, date_cols)
        self.add_columns(key="factor_cols", value=factor_cols)

        self.coefficients = coefficients
        self.set_formula()

    def set_formula(self):
        """Add the linear formula expression to the self LinearFormulaDataFrame.

        The expression is written as a string in the "formula" column, in the form
        ``constant+coeff*factor+coeff*factor...``; null pieces are skipped by concat_ws.
        """

        factor_cols = self.get_columns_by_name("factor_cols")
        # One "coeff*factor" term per (factor column, coefficient) pair.
        factors_string = [
            F.concat_ws("*", F.col(coeff) if isinstance(coeff, str) else F.lit(coeff), F.col(colnam))
            for colnam, coeff in zip(factor_cols, self.coefficients)
        ]

        if len(self.coefficients) > len(factor_cols):
            # The last coefficient is the constant term: a string names a column,
            # anything else is a literal value (F.col would treat it as a column name).
            kt = self.coefficients[-1]
            kt_string = F.col(kt) if isinstance(kt, str) else F.lit(kt)
        else:
            kt_string = F.lit(None)

        self.sdf = self.sdf.withColumn("formula", F.concat_ws("+", kt_string, *factors_string))

    def evaluate(
        self,
        *args: PanamaDataFrame,
        output_col: str = "unitary_amount",
    ) -> PanamaDataFrame:
        """Add a column to a spark dataframe computing the value of the linear formula of self.
           Each factor column holds the name of a variable column: the value of that
           variable is looked up row by row (0 when no value is found) and the values
           are then combined with the coefficients through add_linear_combination.

        Args:
           self (LinearFormulaDataFrame): Formula dataframe containing the factor columns
           *args (PanamaDataFrame): one or more PanamaDataFrame containing the variables values
           output_col (str, optional): name of the output_col containing the formula value

        Returns:
           formula_psdf: the result of joining the variables onto self, with the formula
              value column added and the helper columns dropped
        """
        # Join every variables dataframe onto the formulas, keeping all formula rows,
        # and remember which extra columns the variables brought in.
        extra_cols = []
        formula_psdf = self
        factor_cols = self.get_columns_by_name("factor_cols")
        for var in args:
            formula_psdf = var.automatic_join(formula_psdf, how="right")
            extra_cols = list_value_append(extra_cols, var.get_extra_cols())

        factors_value_name = [f + "_value" for f in factor_cols]
        # Evaluate the value of the string factors
        for f, v in zip(factor_cols, factors_value_name):
            # Distinct variable names referenced by this factor column.
            index_list = get_col_as_list(sdf=self.sdf, colname=f, distinct=True, drop_null=True)
            lookups = [F.when(F.col(f) == i, F.col(i)).otherwise(F.lit(None)) for i in index_list]
            # The trailing 0 is the default for rows without a value; it also keeps
            # coalesce valid when the factor column has no non-null name at all.
            formula_psdf = formula_psdf.withColumn(v, F.coalesce(*lookups, F.lit(0)))

        # Compute the final linear combination
        formula_psdf = add_linear_combination(
            sdf=formula_psdf, coefficients=self.coefficients, colnames=factors_value_name, output_col=output_col  # type: ignore
        )
        formula_psdf = formula_psdf.add_time_series(col_to_add=output_col, serie_type=None, udm=None)  # type: ignore
        formula_psdf = formula_psdf.drop(*extra_cols, *factors_value_name)

        return formula_psdf

Ancestors

Methods

def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame

Add a column to a Spark dataframe containing the value of the linear formula of self. This is the linear-formula counterpart of FormulaDataFrame.evaluate: each factor column names the variable column whose value is used (0 when no value is found), and the values are combined with the coefficients through add_linear_combination.

Args

self : FormulaDataFrame
Formula dataframe containing the formula expressions
*args : PanamaDataFrame
one or more PanamaDataFrame containing the variables values
output_col : str, optional
name of the output_col containing the formula value

Returns

formula_sdf
an object of the same class of variables, with the formula value column computed
def set_formula(self)

Add the linear formula expression to the self LinearFormulaDataFrame

Inherited members