Module `panama.analytics.panama_data_frame`

Classes

class PanamaDataFrame (sdf: DataFrame, key_cols: Union[str, List[str], None] = None, date_cols: Union[str, List[str], None] = None)

Expand source code

class PanamaDataFrame(ColumnCollection):
    """Wrapper around a Spark ``DataFrame`` that records its key and date columns.

    The wrapped DataFrame is stored in ``self.sdf``. The column roles are stored
    through the ``ColumnCollection`` base class; the date columns are registered
    under both the ``"date"`` and the ``"temporal"`` names.
    """

    def __init__(
        self,
        sdf: DataFrame,
        key_cols: Union[str, List[str], None] = None,
        date_cols: Union[str, List[str], None] = None,
    ):
        """Wrap ``sdf`` and register its key and date columns.

        Args:
            sdf (DataFrame): the DataFrame to wrap.
            key_cols (Union[str, List[str], None], optional): key column name(s). Defaults to None.
            date_cols (Union[str, List[str], None], optional): date column name(s). Defaults to None.
        """
        self.sdf = sdf

        # check that all required columns are in sdf
        _check_columns_in_df(sdf, key_cols=key_cols, date_cols=date_cols)

        # initialize column collection (date columns double as temporal columns)
        super().__init__(key_cols=key_cols, date_cols=date_cols, temporal_cols=date_cols)

    def __getattr__(self, attr: str):
        """Forward unknown attributes to the wrapped DataFrame.

        Python calls this only when normal attribute lookup on the instance fails.
        Callable attributes are wrapped so that a ``DataFrame`` result is re-wrapped
        in a ``PanamaDataFrame`` carrying the same key and date columns; any other
        result is returned unchanged.

        Args:
            attr (str): the attribute name being looked up.

        Raises:
            AttributeError: if ``attr`` is not defined in ``DataFrame.__dict__``.
        """
        # NOTE(review): DataFrame.__dict__ only holds names defined directly on the
        # DataFrame class, so names inherited from its base classes are not forwarded.
        if attr not in DataFrame.__dict__:
            raise AttributeError(f"PanamaDataFrame object has no attribute '{attr}'")

        sdf_attr = getattr(self.sdf, attr)
        if not callable(sdf_attr):
            return sdf_attr

        @functools.wraps(sdf_attr)
        def method_wrapper(*args, **kwargs):
            method_result = sdf_attr(*args, **kwargs)
            if not isinstance(method_result, DataFrame):
                return method_result

            key_cols = self.get_columns_by_name("key")
            date_cols = self.get_columns_by_name("date")
            # presumably these raise when the method dropped a tracked column -- TODO confirm
            _check_columns_to_keep(sdf=method_result, method=attr, values=key_cols)
            _check_columns_to_keep(sdf=method_result, method=attr, values=date_cols)

            return PanamaDataFrame(
                method_result,
                key_cols=key_cols,  # type: ignore
                date_cols=date_cols,  # type: ignore
            )

        return method_wrapper

    def find_external_keys(self, sdf_y: PanamaDataFrame, key_type: str = "key") -> Union[str, List[str]]:
        """Finds common external keys between two PanamaDataFrames to be used for joining

        The columns registered in ``sdf_y`` under ``key_type`` are returned when there
        are none or when they are all present in this object's DataFrame; otherwise the
        same test is applied to this object's columns against ``sdf_y``.

        Args:
            sdf_y : a PanamaDataFrame.
            key_type (str, optional): the column group name passed to
                ``get_columns_by_name``. Defaults to "key".

        Returns:
            Union[str, List[str]]: a string or a list of strings with the external keys column names.

        Raises:
            ValueError: if neither object's columns of this type are all found in the other one.
        """

        key_name_x = self.get_columns_by_name(key_type)
        key_name_y = sdf_y.get_columns_by_name(key_type)

        x_columns = self.sdf.columns
        if (key_name_y == []) or all(k in x_columns for k in key_name_y):
            return key_name_y

        y_columns = sdf_y.sdf.columns
        if (key_name_x == []) or all(k in y_columns for k in key_name_x):
            return key_name_x

        raise ValueError("Cannot find any common external key")

    def get_extra_cols(self) -> List[str]:
        """Returns the list of column names in the PanamaDataFrame that are not used as keys or temporal cols.

        Returns:
            List[str]: the column names list
        """
        mapped_cols = self.get_columns_list()
        return [c for c in self.sdf.columns if c not in mapped_cols]

    def _is_interval_sdf(self) -> bool:
        """Check if the PanamaDataFrame is an interval dataframe (with two columns containing a left and right interval extreme)

        The check is name based: the first date column must match one of the start
        patterns and the second date column one of the end patterns.

        Returns:
            bool: True if the first two date columns look like interval extremes.
        """
        start_strings = "start|first|from|inizio"
        end_strings = "end|last|to|fine"
        temp_cols = self.get_columns_by_name("date")

        # An interval needs both extremes: with fewer than two date columns there is
        # no second column to inspect (indexing it used to raise IndexError).
        if not temp_cols or len(temp_cols) < 2:
            return False

        return bool(re.search(start_strings, temp_cols[0]) and re.search(end_strings, temp_cols[1]))

    def automatic_join(self, sdf_y: PanamaDataFrame, how: str = "inner") -> PanamaDataFrame:
        """Joins two PanamaDataFrame objects based on their common keys and temporal information.

        Args:
            sdf_y (PanamaDataFrame): a PanamaDataFrame.
            how (str, optional): the join type (see DataFrame join method). Defaults to "inner".

        Returns:
            PanamaDataFrame: the resulting PanamaDataFrame.

        Raises:
            ValueError: propagated from ``find_external_keys`` when no common key is found.
        """

        # interval check
        is_x_interval = self._is_interval_sdf()
        is_y_interval = sdf_y._is_interval_sdf()

        # temporal cols
        x_temporal_cols = self.get_columns_by_name("temporal")
        y_temporal_cols = sdf_y.get_columns_by_name("temporal")

        # date cols
        x_date_cols = self.get_columns_by_name("date")
        y_date_cols = sdf_y.get_columns_by_name("date")

        # Find key and temporal join columns

        is_interval_join = is_x_interval or is_y_interval
        join_key_dict = {}
        final_key_dict = {}
        key_types = ["key"]

        if is_interval_join and (x_temporal_cols != []) and (y_temporal_cols != []):
            # interval join: each side keeps its own temporal columns in the join
            # expression; the result keeps the temporal columns of the non-interval side
            join_key_dict["temporal.x"] = x_temporal_cols
            join_key_dict["temporal.y"] = y_temporal_cols
            final_key_dict["temporal"] = join_key_dict["temporal.y"] if is_x_interval else join_key_dict["temporal.x"]
        else:
            # otherwise temporal columns are matched like ordinary keys
            key_types.append("temporal")

        for kt in key_types:
            join_key = self.find_external_keys(sdf_y, kt)
            # the result keeps the columns of whichever side did NOT supply the join key
            final_key_dict[kt] = (
                sdf_y.get_columns_by_name(kt)
                if join_key == self.get_columns_by_name(kt)
                else self.get_columns_by_name(kt)
            )
            if kt == "temporal":
                join_key_dict["temporal.x"] = join_key_dict["temporal.y"] = join_key
            else:
                join_key_dict[kt] = join_key

        # Write join expression and join

        on_x = list_value_append(join_key_dict["temporal.x"], join_key_dict["key"])
        on_y = list_value_append(join_key_dict["temporal.y"], join_key_dict["key"])
        join_expr = get_join_expr(on_x, on_y, is_interval_join)
        print(f"Automatically performing join using expression:{join_expr._jc}")
        join_df = join_coalesce_common_cols(self.sdf, sdf_y.sdf, join_expr, how)

        if final_key_dict["temporal"] is not None:
            # only temporal columns that are date columns on either side stay date columns
            final_key_date = [
                k
                for k in final_key_dict["temporal"]
                if (k in list_value_append(x_date_cols)) or (k in list_value_append(y_date_cols))
            ]
        else:
            final_key_date = None

        return PanamaDataFrame(sdf=join_df, key_cols=final_key_dict["key"], date_cols=final_key_date)

    def add_col_date(self, granularity: str, output_col: Union[str, List[str], None] = None) -> None:
        """Adds one new column per date column to the sdf attribute, with the specified granularity.

        The wrapped DataFrame is replaced in place; nothing is returned.

        Args:
            granularity (str): a valid granularity.
            output_col (Union[str, List[str]], optional): the column name(s) for the added
                column(s), one per date column. Defaults to None, in which case the name is
                ``granularity`` when there is a single date column and
                ``<date column>_<granularity>`` otherwise.

        Raises:
            ValueError: if fewer output column names than date columns are supplied.
        """
        list_temporal_col = self.get_columns_by_name("date")
        num_temporal_col = len(list_temporal_col)

        if output_col is None:
            if num_temporal_col == 1:
                output_col = list_value_append(granularity)
            else:
                output_col = [temporal_col + "_" + granularity for temporal_col in list_temporal_col]
        else:
            output_col = list_value_append(output_col)

        # Validate before touching self.sdf so a bad call cannot leave it half updated.
        if len(output_col) < num_temporal_col:
            raise ValueError(
                f"output_col has {len(output_col)} name(s) but the PanamaDataFrame "
                f"has {num_temporal_col} date column(s)"
            )

        for date_col, new_col in zip(list_temporal_col, output_col):
            self.sdf = add_col_date(sdf=self.sdf, granularity=granularity, date_col=date_col, output_col=new_col)

Ancestors

ColumnCollection

Subclasses

Methods

def add_col_date(self, granularity: str, output_col: Union[str, List[str], None] = None) ‑> None

Adds a new column to the sdf attribute of a PanamaDataFrame with the specified granularity, one for each date column.

Args

granularity : str: a valid granularity.
output_col : Union[str, List[str]], optional: the column name for the added column. Defaults to None.

Returns: None. The new column(s) are added in place to the sdf attribute of the PanamaDataFrame.

def automatic_join(self, sdf_y: PanamaDataFrame, how: str = 'inner') ‑> PanamaDataFrame

Joins two PanamaDataFrame objects based on their common keys and temporal information.

Args

sdf_y : PanamaDataFrame: a PanamaDataFrame.
how : str, optional: the join type (see DataFrame join method). Defaults to "inner".

Returns

PanamaDataFrame: the resulting PanamaDataFrame.

def find_external_keys(self, sdf_y: PanamaDataFrame, key_type: str = 'key') ‑> Union[str, List[str]]

Finds common external keys between two PanamaDataFrames to be used for joining

Args

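sdf_y : a PanamaDataFrame.
key_type : str, optional: the column group name to look up in both PanamaDataFrames. Defaults to 'key'.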

Returns

Union[str, List[str]]: a string or a list of strings with the external keys column names.

def get_extra_cols(self) ‑> List[str]

Returns the list of column names in the PanamaDataFrame that are not used as keys or temporal cols.

Returns

List[str]: the column names list

Inherited members

ColumnCollection:
- add_columns
- drop_column_value
- get_columns
- get_columns_by_name
- get_columns_keys
- get_columns_list
- update_columns