Module panama.analytics.panama_data_frame
Classes
class PanamaDataFrame (sdf: DataFrame, key_cols: Union[str, List[str], None] = None, date_cols: Union[str, List[str], None] = None)
-
Expand source code
class PanamaDataFrame(ColumnCollection):
    """A DataFrame wrapper that remembers which columns are keys and which are dates.

    The wrapped DataFrame is stored in ``sdf``. Column roles are stored through the
    ``ColumnCollection`` base class; the date columns are registered both as "date"
    and as "temporal" columns.

    Attributes defined directly on ``DataFrame`` are forwarded to ``sdf`` (see
    ``__getattr__``), so most DataFrame methods can be called on this object.
    """

    def __init__(
        self,
        sdf: DataFrame,
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ) -> None:
        """Wrap ``sdf`` and register its key and date columns.

        Args:
            sdf (DataFrame): the DataFrame to wrap.
            key_cols (Union[str, List[str], None], optional): key column name(s). Defaults to None.
            date_cols (Union[str, List[str], None], optional): date column name(s). Defaults to None.
        """
        self.sdf = sdf
        # check that all required columns are in sdf
        _check_columns_in_df(sdf, key_cols=key_cols, date_cols=date_cols)
        # initialize column collection (date columns double as temporal columns)
        super().__init__(key_cols=key_cols, date_cols=date_cols, temporal_cols=date_cols)

    def __getattr__(self, attr: str):
        """Forward attributes defined on ``DataFrame`` to the wrapped ``sdf``.

        Only called when normal attribute lookup fails. Only names found in
        ``DataFrame.__dict__`` (i.e. defined directly on the class) are forwarded.
        Callables are wrapped so that a DataFrame result is re-wrapped in a
        ``PanamaDataFrame`` carrying the same key and date columns.

        Args:
            attr (str): the attribute name being looked up.

        Raises:
            AttributeError: if ``attr`` is not defined on ``DataFrame``.
        """
        if attr not in DataFrame.__dict__:
            raise AttributeError(f"PanamaDataFrame object has no attribute '{attr}'")

        sdf_attr = getattr(self.sdf, attr)
        if not callable(sdf_attr):
            return sdf_attr

        @functools.wraps(sdf_attr)
        def method_wrapper(*args, **kwargs):
            method_result = sdf_attr(*args, **kwargs)
            if not isinstance(method_result, DataFrame):
                return method_result
            key_cols = self.get_columns_by_name("key")
            date_cols = self.get_columns_by_name("date")
            # the registered key/date columns are validated against the result
            # before it is re-wrapped with the same column roles
            _check_columns_to_keep(sdf=method_result, method=attr, values=key_cols)
            _check_columns_to_keep(sdf=method_result, method=attr, values=date_cols)
            return PanamaDataFrame(
                method_result,
                key_cols=key_cols,  # type: ignore
                date_cols=date_cols,  # type: ignore
            )

        return method_wrapper

    def find_external_keys(self, sdf_y: PanamaDataFrame, key_type: str = "key") -> Union[str, List[str]]:
        """Finds common external keys between two PanamaDataFrames to be used for joining

        The columns of ``sdf_y`` registered under ``key_type`` are returned when
        there are none or when all of them are columns of this object's ``sdf``.
        Otherwise the same test is applied the other way round with this object's
        columns.

        Args:
            sdf_y (PanamaDataFrame): a PanamaDataFrame.
            key_type (str, optional): the column group passed to ``get_columns_by_name``. Defaults to "key".

        Returns:
            Union[str, List[str]]: a string or a list of strings with the external keys column names.

        Raises:
            ValueError: if neither side's columns are all present in the other.
        """
        key_name_x = self.get_columns_by_name(key_type)
        key_name_y = sdf_y.get_columns_by_name(key_type)

        if key_name_y == []:
            return key_name_y
        x_columns = self.sdf.columns
        if all(k in x_columns for k in key_name_y):
            return key_name_y

        if key_name_x == []:
            return key_name_x
        y_columns = sdf_y.sdf.columns
        if all(k in y_columns for k in key_name_x):
            return key_name_x

        raise ValueError("Cannot find any common external key")

    def get_extra_cols(self) -> List[str]:
        """Returns the list of column names in the PanamaDataFrame that are not used as keys or temporal cols.

        Returns:
            List[str]: the column names list
        """
        mapped_cols = self.get_columns_list()
        return [c for c in self.sdf.columns if c not in mapped_cols]

    def _is_interval_sdf(self) -> bool:
        """Check if the PanamaDataFrame is an interval dataframe (with two columns
        containing a left and right interval extreme).

        The first date column name must match the start pattern and the second
        one the end pattern.

        Returns:
            bool: True if the first two date columns look like an interval, False otherwise.
        """
        # NOTE: re.search matches anywhere in the name, so e.g. "to" also matches "total".
        start_strings = "start|first|from|inizio"
        end_strings = "end|last|to|fine"
        temp_cols = self.get_columns_by_name("date")
        # An interval needs two date columns; without this guard a single
        # start-like column raised IndexError on temp_cols[1].
        if len(temp_cols) < 2:
            return False
        return bool(re.search(start_strings, temp_cols[0]) and re.search(end_strings, temp_cols[1]))

    def automatic_join(self, sdf_y: PanamaDataFrame, how: str = "inner") -> PanamaDataFrame:
        """Joins two PanamaDataFrame objects based on their common keys and temporal information.

        Args:
            sdf_y (PanamaDataFrame): a PanamaDataFrame.
            how (str, optional): the join type (see DataFrame join method). Defaults to "inner".

        Returns:
            PanamaDataFrame: the resulting PanamaDataFrame.

        Raises:
            ValueError: propagated from ``find_external_keys`` when no common key is found.
        """
        # interval check
        is_x_interval = self._is_interval_sdf()
        is_y_interval = sdf_y._is_interval_sdf()
        # temporal cols
        x_temporal_cols = self.get_columns_by_name("temporal")
        y_temporal_cols = sdf_y.get_columns_by_name("temporal")
        # date cols
        x_date_cols = self.get_columns_by_name("date")
        y_date_cols = sdf_y.get_columns_by_name("date")

        # Find key and temporal join columns
        is_interval_join = is_x_interval or is_y_interval
        join_key_dict = {}
        final_key_dict = {}
        key_type = ["key"]
        if is_interval_join and (x_temporal_cols != []) and (y_temporal_cols != []):
            # Interval join: each side joins on its own temporal columns and the
            # result keeps the temporal columns of the non-interval side.
            join_key_dict["temporal.x"] = x_temporal_cols
            join_key_dict["temporal.y"] = y_temporal_cols
            final_key_dict["temporal"] = y_temporal_cols if is_x_interval else x_temporal_cols
        else:
            # Otherwise temporal columns are resolved like ordinary keys below.
            key_type.append("temporal")

        for kt in key_type:
            join_key = self.find_external_keys(sdf_y, kt)
            own_cols = self.get_columns_by_name(kt)
            # The result keeps the columns of the side that did NOT provide the join key.
            final_key_dict[kt] = sdf_y.get_columns_by_name(kt) if join_key == own_cols else own_cols
            if kt == "temporal":
                join_key_dict["temporal.x"] = join_key_dict["temporal.y"] = join_key
            else:
                join_key_dict[kt] = join_key

        # Write join expression and join
        on_x = list_value_append(join_key_dict["temporal.x"], join_key_dict["key"])
        on_y = list_value_append(join_key_dict["temporal.y"], join_key_dict["key"])
        join_expr = get_join_expr(on_x, on_y, is_interval_join)
        print(f"Automatically performing join using expression:{join_expr._jc}")
        join_df = join_coalesce_common_cols(self.sdf, sdf_y.sdf, join_expr, how)

        if final_key_dict["temporal"] is not None:
            # Only temporal columns that are date columns on either side are
            # registered as date columns of the result.
            x_dates = list_value_append(x_date_cols)
            y_dates = list_value_append(y_date_cols)
            final_key_date = [k for k in final_key_dict["temporal"] if (k in x_dates) or (k in y_dates)]
        else:
            final_key_date = None
        return PanamaDataFrame(sdf=join_df, key_cols=final_key_dict["key"], date_cols=final_key_date)

    def add_col_date(self, granularity: str, output_col: Union[str, List[str], None] = None) -> None:
        """Adds one new column per date column to the ``sdf`` attribute, with the specified granularity.

        The wrapped DataFrame is replaced in place; nothing is returned.

        Args:
            granularity (str): a valid granularity.
            output_col (Union[str, List[str], None], optional): the column name(s) for the
                added column(s), one per date column. When None, the name is ``granularity``
                if there is a single date column, otherwise ``<date_col>_<granularity>``
                for each date column. Defaults to None.

        Raises:
            ValueError: if fewer output names than date columns are provided.
        """
        list_temporal_col = self.get_columns_by_name("date")
        num_temporal_col = len(list_temporal_col)

        if output_col is not None:
            output_col = list_value_append(output_col)
        elif num_temporal_col == 1:
            output_col = list_value_append(granularity)
        else:
            output_col = [temporal_col + "_" + granularity for temporal_col in list_temporal_col]

        # Fail before touching self.sdf: previously a short list raised IndexError
        # mid-loop, leaving self.sdf with only some of the new columns.
        if len(output_col) < num_temporal_col:
            raise ValueError(
                f"output_col has {len(output_col)} name(s) but there are {num_temporal_col} date column(s)"
            )

        # zip stops at the last date column, so surplus names are ignored as before
        for date_col, new_col in zip(list_temporal_col, output_col):
            self.sdf = add_col_date(sdf=self.sdf, granularity=granularity, date_col=date_col, output_col=new_col)
Ancestors
Subclasses
Methods
def add_col_date(self, granularity: str, output_col: Union[str, List[str], None] = None) ‑> None
-
Adds a new column for each date column to the sdf attribute of a PanamaDataFrame, with the specified granularity.
Args
granularity
:str
- a valid granularity.
output_col
:Union[str, List[str]]
, optional- the column name for the added column. Defaults to None.
Returns: None. The new column(s) are written to the sdf attribute in place.
def automatic_join(self, sdf_y: PanamaDataFrame, how: str = 'inner') ‑> PanamaDataFrame
-
Joins two PanamaDataFrame objects based on their common keys and temporal information.
Args
sdf_y
:PanamaDataFrame
- a PanamaDataFrame.
how
:str
, optional- the join type (see DataFrame join method). Defaults to "inner".
Returns
PanamaDataFrame
- the resulting PanamaDataFrame.
def find_external_keys(self, sdf_y: PanamaDataFrame, key_type: str = 'key') ‑> Union[str, List[str]]
-
Finds common external keys between two PanamaDataFrames to be used for joining
Args
sdf_y : a PanamaDataFrame.
Returns
Union[str, List[str]]
- a string or a list of strings with the external keys column names.
def get_extra_cols(self) ‑> List[str]
-
Returns the list of column names in the PanamaDataFrame that are not used as keys or temporal cols.
Returns
List[str]
- the column names list
Inherited members