Module `panama.analytics.formula_data_frame`

Classes

class FormulaDataFrame (sdf: pyspark.sql.dataframe.DataFrame, formula_col: str, key_cols: str | List[str] | None = None, date_cols: str | List[str] | None = None)

Expand source code

class FormulaDataFrame(PanamaDataFrame):
    """PanamaDataFrame whose rows carry a formula expression to be evaluated.

    The name of the column holding the formula text is registered through
    ``add_columns`` under the key ``"formula_col"``.
    """

    def __init__(
        self,
        sdf: DataFrame,
        formula_col: str,
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Wrap ``sdf`` and register its formula column.

        Args:
            sdf: spark dataframe containing the formulas.
            formula_col: name of the column holding the formula expressions.
            key_cols: key column(s), forwarded to the PanamaDataFrame constructor.
            date_cols: date column(s), forwarded to the PanamaDataFrame constructor.
        """
        super().__init__(sdf, key_cols=key_cols, date_cols=date_cols)
        self.add_columns(key="formula_col", value=formula_col)

    def evaluate(self, *args: PanamaDataFrame, output_col: str = "unitary_amount") -> PanamaDataFrame:
        """Evaluate the formula stored on each row and write the result to ``output_col``.

        Every distinct, non-null formula found in the formula column of self is turned
        into a column expression with ``F.expr`` and applied only to the rows carrying
        that exact formula text. One expression is generated per distinct formula, so
        this gets expensive when the formula column holds many different formulas;
        prefer LinearFormulaDataFrame when the formulas are linear.

        Args:
            *args (PanamaDataFrame): dataframes holding the variables referenced by the
                formulas; each one is right-joined onto the running result.
            output_col (str, optional): name of the column receiving the formula value.

        Returns:
            The joined dataframe with ``output_col`` added through ``add_time_series``
            and the columns reported by each argument's ``get_extra_cols()`` dropped.
        """
        # NOTE(review): __init__ registers the column under the key "formula_col" while
        # the lookup below uses "formula" -- presumably get_columns_by_name matches on a
        # partial name; confirm against PanamaDataFrame.
        formula_name = self.get_columns_by_name("formula")[0]
        distinct_formulas = get_col_as_list(sdf=self.sdf, colname=formula_name, distinct=True, drop_null=True)

        # Right joins keep every formula row even when a variables dataframe has no match.
        result = self
        cols_to_drop = []
        for variables in args:
            result = variables.automatic_join(result, how="right")
            cols_to_drop = list_value_append(cols_to_drop, variables.get_extra_cols())

        # One guarded branch per distinct formula: it yields the evaluated expression on
        # rows whose formula text matches and null elsewhere, so coalesce picks the
        # single matching branch for each row.
        # NOTE(review): the formula text is handed to F.expr as-is, so it must come
        # from a trusted source.
        # NOTE(review): with no non-null formula, coalesce is called without
        # arguments -- confirm this case cannot occur.
        branches = []
        for text in distinct_formulas:
            applies = F.col(formula_name) == text
            branches.append(F.when(applies, F.expr(text)).otherwise(F.lit(None)))
        value = F.coalesce(*branches)

        result = result.add_time_series(col_to_add=output_col, serie_type=None, udm=None, col=value)
        return result.drop(*cols_to_drop)

Ancestors

Subclasses

LinearFormulaDataFrame

Methods

def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame

Expand source code

def evaluate(self, *args: PanamaDataFrame, output_col: str = "unitary_amount") -> PanamaDataFrame:
    """Evaluate the formula stored on each row and write the result to ``output_col``.

    Every distinct, non-null formula found in the formula column of self is turned
    into a column expression with ``F.expr`` and applied only to the rows carrying
    that exact formula text. One expression is generated per distinct formula, so
    this gets expensive when the formula column holds many different formulas;
    prefer LinearFormulaDataFrame when the formulas are linear.

    Args:
        *args (PanamaDataFrame): dataframes holding the variables referenced by the
            formulas; each one is right-joined onto the running result.
        output_col (str, optional): name of the column receiving the formula value.

    Returns:
        The joined dataframe with ``output_col`` added through ``add_time_series``
        and the columns reported by each argument's ``get_extra_cols()`` dropped.
    """
    # NOTE(review): the lookup key here is "formula" -- confirm it resolves to the
    # column registered by the constructor.
    formula_name = self.get_columns_by_name("formula")[0]
    distinct_formulas = get_col_as_list(sdf=self.sdf, colname=formula_name, distinct=True, drop_null=True)

    # Right joins keep every formula row even when a variables dataframe has no match.
    result = self
    cols_to_drop = []
    for variables in args:
        result = variables.automatic_join(result, how="right")
        cols_to_drop = list_value_append(cols_to_drop, variables.get_extra_cols())

    # One guarded branch per distinct formula: it yields the evaluated expression on
    # rows whose formula text matches and null elsewhere, so coalesce picks the
    # single matching branch for each row.
    # NOTE(review): the formula text is handed to F.expr as-is, so it must come
    # from a trusted source.
    # NOTE(review): with no non-null formula, coalesce is called without
    # arguments -- confirm this case cannot occur.
    branches = []
    for text in distinct_formulas:
        applies = F.col(formula_name) == text
        branches.append(F.when(applies, F.expr(text)).otherwise(F.lit(None)))
    value = F.coalesce(*branches)

    result = result.add_time_series(col_to_add=output_col, serie_type=None, udm=None, col=value)
    return result.drop(*cols_to_drop)

Add a column to a spark dataframe by computing the value of the formula column of self. The formula can be any valid expression on the columns of the variables. For performance reasons, use LinearFormulaDataFrame if the formula is linear. In general, be careful when using this function if the formula column contains many different formulas.

Args

self : FormulaDataFrame: Formula dataframe containing the formula expressions
args : PanamaDataFrame: PanamaDataFrame containing the variables values
output_col : str, optional: name of the output_col containing the formula value

Returns

formula_psdf: an object of the same class of variables, with the formula value column computed

Inherited members

PanamaDataFrame:
- add_col_date
- add_columns
- automatic_join
- drop_column_value
- find_external_keys
- get_columns
- get_columns_by_name
- get_columns_keys
- get_columns_list
- get_extra_cols
- update_columns

Expand source code

class LinearFormulaDataFrame(FormulaDataFrame):
    """FormulaDataFrame specialised for linear formulas.

    A linear formula is a sum of ``coefficient * factor`` terms plus an optional
    constant term. Its textual form is written to the ``"formula"`` column at
    construction time by ``set_formula``.
    """

    def __init__(
        self,
        sdf,
        coefficients: List[Union[str, int, float]],
        factor_cols: List[str],
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Wrap ``sdf``, register the factor columns and build the formula column.

        Args:
            sdf: spark dataframe holding the factor columns (and any coefficient columns).
            coefficients: one coefficient per factor column, optionally followed by a
                constant term. A string is read as the name of a column of ``sdf``,
                any other value as a literal.
            factor_cols: names of the columns holding the factors.
            key_cols: key column(s), forwarded to the parent constructor.
            date_cols: date column(s), forwarded to the parent constructor.
        """
        super().__init__(sdf, "formula", key_cols, date_cols)
        self.add_columns(key="factor_cols", value=factor_cols)

        self.coefficients = coefficients
        self.set_formula()

    def set_formula(self):
        """Write the textual linear formula to the ``"formula"`` column of ``self.sdf``.

        Each factor contributes a ``coefficient*factor`` term and the terms are joined
        with ``+``. When there are more coefficients than factor columns, the last
        coefficient is used as the constant term and placed first.
        """
        factor_cols = self.get_columns_by_name("factor_cols")
        # String coefficients name a column of the dataframe; anything else is a literal.
        factors_string = [
            F.concat_ws("*", F.col(coeff) if isinstance(coeff, str) else F.lit(coeff), F.col(colnam))
            for colnam, coeff in zip(factor_cols, self.coefficients)
        ]

        if len(self.coefficients) > len(factor_cols):
            kt = self.coefficients[-1]
            # A non-string constant must be a literal, exactly like the per-factor
            # coefficients above; F.col() is only valid for a column name.
            kt_string = F.col(kt) if isinstance(kt, str) else F.lit(kt)
        else:
            # concat_ws skips null inputs, so a null here means "no constant term".
            kt_string = F.lit(None)

        self.sdf = self.sdf.withColumn("formula", F.concat_ws("+", kt_string, *factors_string))

    def evaluate(
        self,
        *args: PanamaDataFrame,
        output_col: str = "unitary_amount",
    ) -> PanamaDataFrame:
        """Compute the linear combination of the factors and store it in ``output_col``.

        Each factor column of self holds, per row, the name of the column to read the
        factor value from. That value is resolved into a temporary ``<factor>_value``
        column (0 when nothing matches or the matched value is null), then the
        temporary columns are combined with ``self.coefficients`` by
        ``add_linear_combination``.

        Args:
           *args (PanamaDataFrame): one or more PanamaDataFrame containing the variables values
           output_col (str, optional): name of the output_col containing the formula value

        Returns:
           The joined dataframe with ``output_col`` computed; the temporary value
           columns and the columns reported by each argument's ``get_extra_cols()``
           are dropped.
        """
        # Right-join every variables dataframe so that all formula rows are kept.
        extra_cols = []
        formula_psdf = self
        factor_cols = self.get_columns_by_name("factor_cols")
        for var in args:
            formula_psdf = var.automatic_join(formula_psdf, how="right")
            extra_cols = list_value_append(extra_cols, var.get_extra_cols())

        factors_value_name = [f + "_value" for f in factor_cols]
        # Evaluate the value of the string factors
        for f, v in zip(factor_cols, factors_value_name):
            index_list = get_col_as_list(sdf=self.sdf, colname=f, distinct=True, drop_null=True)
            # F.when without otherwise() yields null for unmatched rows. Appending the
            # 0 default to the same coalesce keeps the call valid when index_list is
            # empty (a coalesce with no arguments is rejected by Spark).
            candidates = [F.when(F.col(f) == i, F.col(i)) for i in index_list]
            formula_psdf = formula_psdf.withColumn(v, F.coalesce(*candidates, F.lit(0)))

        # Compute the final linear combination
        formula_psdf = add_linear_combination(
            sdf=formula_psdf, coefficients=self.coefficients, colnames=factors_value_name, output_col=output_col  # type: ignore
        )
        formula_psdf = formula_psdf.add_time_series(col_to_add=output_col, serie_type=None, udm=None)  # type: ignore
        formula_psdf = formula_psdf.drop(*extra_cols, *factors_value_name)

        return formula_psdf

Ancestors

Methods

def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame

Expand source code

def evaluate(
    self,
    *args: PanamaDataFrame,
    output_col: str = "unitary_amount",
) -> PanamaDataFrame:
    """Compute the linear combination of the factors and store it in ``output_col``.

    Each factor column of self holds, per row, the name of the column to read the
    factor value from. That value is resolved into a temporary ``<factor>_value``
    column (0 when nothing matches or the matched value is null), then the
    temporary columns are combined with ``self.coefficients`` by
    ``add_linear_combination``.

    Args:
       *args (PanamaDataFrame): one or more PanamaDataFrame containing the variables values
       output_col (str, optional): name of the output_col containing the formula value

    Returns:
       The joined dataframe with ``output_col`` computed; the temporary value
       columns and the columns reported by each argument's ``get_extra_cols()``
       are dropped.
    """
    # Right-join every variables dataframe so that all formula rows are kept.
    extra_cols = []
    formula_psdf = self
    factor_cols = self.get_columns_by_name("factor_cols")
    for var in args:
        formula_psdf = var.automatic_join(formula_psdf, how="right")
        extra_cols = list_value_append(extra_cols, var.get_extra_cols())

    factors_value_name = [f + "_value" for f in factor_cols]
    # Evaluate the value of the string factors
    for f, v in zip(factor_cols, factors_value_name):
        index_list = get_col_as_list(sdf=self.sdf, colname=f, distinct=True, drop_null=True)
        # F.when without otherwise() yields null for unmatched rows. Appending the
        # 0 default to the same coalesce keeps the call valid when index_list is
        # empty (a coalesce with no arguments is rejected by Spark).
        candidates = [F.when(F.col(f) == i, F.col(i)) for i in index_list]
        formula_psdf = formula_psdf.withColumn(v, F.coalesce(*candidates, F.lit(0)))

    # Compute the final linear combination
    formula_psdf = add_linear_combination(
        sdf=formula_psdf, coefficients=self.coefficients, colnames=factors_value_name, output_col=output_col  # type: ignore
    )
    formula_psdf = formula_psdf.add_time_series(col_to_add=output_col, serie_type=None, udm=None)  # type: ignore
    formula_psdf = formula_psdf.drop(*extra_cols, *factors_value_name)

    return formula_psdf

Args

self : FormulaDataFrame: Formula dataframe containing the formula expressions
*args : PanamaDataFrame: one or more PanamaDataFrame containing the variables values
output_col : str, optional: name of the output_col containing the formula value

Returns

formula_sdf: an object of the same class of variables, with the formula value column computed

def set_formula(self)

Expand source code

def set_formula(self):
    """Write the textual linear formula to the ``"formula"`` column of ``self.sdf``.

    Each factor contributes a ``coefficient*factor`` term and the terms are joined
    with ``+``. When there are more coefficients than factor columns, the last
    coefficient is used as the constant term and placed first.
    """
    factor_cols = self.get_columns_by_name("factor_cols")
    # String coefficients name a column of the dataframe; anything else is a literal.
    factors_string = [
        F.concat_ws("*", F.col(coeff) if isinstance(coeff, str) else F.lit(coeff), F.col(colnam))
        for colnam, coeff in zip(factor_cols, self.coefficients)
    ]

    if len(self.coefficients) > len(factor_cols):
        kt = self.coefficients[-1]
        # A non-string constant must be a literal, exactly like the per-factor
        # coefficients above; F.col() is only valid for a column name.
        kt_string = F.col(kt) if isinstance(kt, str) else F.lit(kt)
    else:
        # concat_ws skips null inputs, so a null here means "no constant term".
        kt_string = F.lit(None)

    self.sdf = self.sdf.withColumn("formula", F.concat_ws("+", kt_string, *factors_string))

Add the linear formula expression to this LinearFormulaDataFrame.

Inherited members

FormulaDataFrame:
- add_col_date
- add_columns
- automatic_join
- drop_column_value
- find_external_keys
- get_columns
- get_columns_by_name
- get_columns_keys
- get_columns_list
- get_extra_cols
- update_columns