Module panama.analytics.formula_data_frame
Classes
class FormulaDataFrame (sdf: pyspark.sql.dataframe.DataFrame, formula_col: str, key_cols: Union[str, List[str], ForwardRef(None)] = None, date_cols: Union[str, List[str], ForwardRef(None)] = None)
-
Expand source code
class FormulaDataFrame(PanamaDataFrame):
    """PanamaDataFrame that carries a column of formula expressions.

    The name of the formula column is registered on the instance through
    ``add_columns`` under the key ``"formula_col"``.
    """

    def __init__(
        self,
        sdf: DataFrame,
        formula_col: str,
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Wrap ``sdf`` and register its formula column.

        Args:
            sdf (DataFrame): spark dataframe holding the formulas.
            formula_col (str): name of the column containing the formula expressions.
            key_cols (Union[str, List[str], None], optional): forwarded to PanamaDataFrame.
            date_cols (Union[str, List[str], None], optional): forwarded to PanamaDataFrame.
        """
        super().__init__(sdf, key_cols=key_cols, date_cols=date_cols)
        self.add_columns(key="formula_col", value=formula_col)

    def evaluate(self, *args: PanamaDataFrame, output_col: str = "unitary_amount") -> PanamaDataFrame:
        """Compute the value of every formula of self and store it in ``output_col``.

        Each dataframe in ``args`` is joined to the formulas (right join, so the
        formula rows are the ones preserved). Then one conditional expression is
        built per distinct non-null formula and the results are coalesced into a
        single column. The number of expressions grows with the number of distinct
        formulas, so prefer LinearFormulaDataFrame when the formula is linear.

        NOTE(review): every distinct formula string is handed to ``F.expr`` and is
        therefore parsed as a Spark SQL expression; confirm that the formula column
        only ever comes from a trusted source.

        Args:
            *args (PanamaDataFrame): dataframes containing the variables values.
            output_col (str, optional): name of the column receiving the formula value.

        Returns:
            PanamaDataFrame: the joined dataframe with ``output_col`` added and the
                extra columns reported by the variables dataframes dropped.
        """
        formula_col = self.get_columns_by_name("formula")[0]
        # Collected from the un-joined formulas: one entry per distinct non-null formula.
        distinct_formulas = get_col_as_list(sdf=self.sdf, colname=formula_col, distinct=True, drop_null=True)

        cols_to_drop = []
        result = self
        for variables in args:
            result = variables.automatic_join(result, how="right")
            cols_to_drop = list_value_append(cols_to_drop, variables.get_extra_cols())

        # One branch per formula: evaluate the expression only on the rows that carry it.
        branches = []
        for formula in distinct_formulas:
            is_this_formula = F.col(formula_col) == formula
            branches.append(F.when(is_this_formula, F.expr(formula)).otherwise(F.lit(None)))

        result = result.add_time_series(
            col_to_add=output_col,
            serie_type=None,
            udm=None,
            col=F.coalesce(*branches),
        )
        return result.drop(*cols_to_drop)
Ancestors
Subclasses
Methods
def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame
-
Add a column to a spark dataframe computing the value of the formula column of self. The formula can be any valid expression on columns of variables. For performance reasons, use LinearFormulaDataFrame if the formula is linear. In general, be careful when using this function if the formula column contains many different formulas.
Args
self
:FormulaDataFrame
- Formula dataframe containing the formula expressions
args
:PanamaDataFrame
- PanamaDataFrame containing the variables values
output_col
:str
, optional- name of the output_col containing the formula value
Returns
formula_psdf
- an object of the same class of variables, with the formula value column computed
Inherited members
class LinearFormulaDataFrame (sdf, coefficients: List[Union[str, int, float]], factor_cols: List[str], key_cols: Union[str, List[str], ForwardRef(None)] = None, date_cols: Union[str, List[str], ForwardRef(None)] = None)
-
Expand source code
class LinearFormulaDataFrame(FormulaDataFrame):
    """FormulaDataFrame whose formula is a linear combination of factor columns.

    The ``"formula"`` column is not supplied by the caller: it is generated by
    ``set_formula`` from ``coefficients`` and ``factor_cols`` at construction time.
    """

    def __init__(
        self,
        sdf,
        coefficients: List[Union[str, int, float]],
        factor_cols: List[str],
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Wrap ``sdf``, register the factor columns and build the formula column.

        Args:
            sdf: spark dataframe holding the factor columns.
            coefficients (List[Union[str, int, float]]): one coefficient per factor
                column; a string is treated as the name of a column of ``sdf``, any
                other value as a literal. One extra trailing element, if present,
                is used as the constant term.
            factor_cols (List[str]): names of the columns containing the factors.
            key_cols (Union[str, List[str], None], optional): forwarded to FormulaDataFrame.
            date_cols (Union[str, List[str], None], optional): forwarded to FormulaDataFrame.
        """
        super().__init__(sdf, "formula", key_cols, date_cols)
        self.add_columns(key="factor_cols", value=factor_cols)
        self.coefficients = coefficients
        self.set_formula()

    def set_formula(self):
        """Add the linear formula expression to the self LinearFormulaDataFrame.

        The expression is written to the ``"formula"`` column as a string of the
        form ``constant+coeff*factor+...``.
        """
        factor_cols = self.get_columns_by_name("factor_cols")
        factors_string = [
            F.concat_ws("*", F.col(coeff) if isinstance(coeff, str) else F.lit(coeff), F.col(colnam))
            for colnam, coeff in zip(factor_cols, self.coefficients)
        ]
        if len(self.coefficients) > len(factor_cols):
            kt = self.coefficients[-1]
            # A string names a column; anything else is a literal constant. Both
            # branches used to call F.col, which treated a numeric constant as a
            # column name.
            kt_string = F.col(kt) if isinstance(kt, str) else F.lit(kt)
        else:
            # No constant term: concat_ws skips null inputs, so nothing is prepended.
            kt_string = F.lit(None)
        self.sdf = self.sdf.withColumn("formula", F.concat_ws("+", kt_string, *factors_string))

    def evaluate(
        self,
        *args: PanamaDataFrame,
        output_col: str = "unitary_amount",
    ) -> PanamaDataFrame:
        """Compute the linear combination of the factors and store it in ``output_col``.

        Each dataframe in ``args`` is joined to the formulas (right join). Every
        value found in a factor column is then used as the name of a column of the
        joined dataframe, and the value of that column is copied into a temporary
        ``<factor>_value`` column (0 when the factor is null). The temporary columns
        are finally combined with ``self.coefficients`` by ``add_linear_combination``.

        Args:
            *args (PanamaDataFrame): one or more PanamaDataFrame containing the variables values.
            output_col (str, optional): name of the column receiving the formula value.

        Returns:
            PanamaDataFrame: the joined dataframe with ``output_col`` added; the extra
                columns reported by the variables dataframes and the temporary
                ``_value`` columns are dropped.
        """
        # Join the variables onto the formulas and collect the columns to drop at the end
        extra_cols = []
        formula_psdf = self
        factor_cols = self.get_columns_by_name("factor_cols")
        for var in args:
            formula_psdf = var.automatic_join(formula_psdf, how="right")
            extra_cols = list_value_append(extra_cols, var.get_extra_cols())
        factors_value_name = [f + "_value" for f in factor_cols]

        # Evaluate the value of the string factors
        for f, v in zip(factor_cols, factors_value_name):
            index_list = get_col_as_list(sdf=self.sdf, colname=f, distinct=True, drop_null=True)
            # when() without otherwise() yields null on the rows that do not match.
            lookups = [F.when(F.col(f) == i, F.col(i)) for i in index_list]
            # coalesce needs at least one argument: a factor column with no non-null
            # value gets a null lookup, which the fallback below turns into 0.
            looked_up = F.coalesce(*lookups) if lookups else F.lit(None)
            formula_psdf = formula_psdf.withColumn(v, looked_up)
            formula_psdf = formula_psdf.withColumn(v, F.coalesce(F.col(v), F.lit(0)))

        # Compute the final linear combination
        formula_psdf = add_linear_combination(
            sdf=formula_psdf, coefficients=self.coefficients, colnames=factors_value_name, output_col=output_col  # type: ignore
        )
        formula_psdf = formula_psdf.add_time_series(col_to_add=output_col, serie_type=None, udm=None)  # type: ignore
        formula_psdf = formula_psdf.drop(*extra_cols, *factors_value_name)
        return formula_psdf
Ancestors
Methods
def evaluate(self, *args: PanamaDataFrame, output_col: str = 'unitary_amount') ‑> PanamaDataFrame
-
Add a column to a spark dataframe computing the value of the formula column of self. The formula can be any valid expression on columns of variables. For performance reasons, use LinearFormulaDataFrame if the formula is linear. In general, be careful when using this function if the formula column contains many different formulas.
Args
self
:FormulaDataFrame
- Formula dataframe containing the formula expressions
*args
:PanamaDataFrame
- one or more PanamaDataFrame containing the variables values
output_col
:str
, optional- name of the output_col containing the formula value
Returns
formula_sdf
- an object of the same class of variables, with the formula value column computed
def set_formula(self)
-
Add the linear formula expression to the self LinearFormulaDataFrame
Inherited members