Bases: NormalizationBaseClass
Source code in src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class NormalizationMean(NormalizationBaseClass):
    """Mean normalization component: scales a column to (value - mean) / (max - min).

    The column statistics (mean, min, max) are stashed in ``self.reversal_value``
    so that ``_denormalize_column`` can invert the transformation later.
    """

    NORMALIZED_COLUMN_NAME = "mean"

    def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame:
        """
        Private method to apply Mean normalization to the specified column.

        Mean normalization: (value - mean) / (max - min)

        Parameters:
            df: Input DataFrame.
            column: Name of the numeric column to normalize.

        Returns:
            DataFrame with an added column holding the normalized values.

        Raises:
            ZeroDivisionError: If (max - min) is (close to) zero or not finite.
        """
        # Compute all three aggregates in ONE Spark job instead of three
        # separate collect() passes over the data (same results, one scan).
        mean_val, min_val, max_val = df.select(
            F.mean(F.col(column)),
            F.min(F.col(column)),
            F.max(F.col(column)),
        ).first()
        # NOTE(review): aggregates are None on an empty DataFrame, which makes
        # the subtraction below raise TypeError — confirm whether empty input
        # should be rejected explicitly upstream.

        divisor = max_val - min_val
        # NOTE(review): abs_tol=10e-8 equals 1e-7; kept as-is from the
        # original — confirm whether 1e-8 was intended.
        if math.isclose(divisor, 0.0, abs_tol=10e-8) or not math.isfinite(divisor):
            raise ZeroDivisionError("Division by Zero in Mean")

        store_column = self._get_norm_column_name(column)
        # Persist the stats required by _denormalize_column.
        self.reversal_value = [mean_val, min_val, max_val]
        return df.withColumn(
            store_column,
            (F.col(column) - F.lit(mean_val)) / (F.lit(max_val) - F.lit(min_val)),
        )

    def _denormalize_column(
        self, df: PySparkDataFrame, column: str
    ) -> PySparkDataFrame:
        """
        Private method to revert Mean normalization to the specified column.

        Mean denormalization: normalized_value * (max - min) + mean = value

        Parameters:
            df: DataFrame containing the normalized column.
            column: Name of the normalized column to revert.

        Returns:
            DataFrame with an added column holding the denormalized values.
        """
        # Unpack the stats recorded during normalization.
        mean_val, min_val, max_val = self.reversal_value
        store_column = self._get_norm_column_name(column)
        return df.withColumn(
            store_column,
            F.col(column) * (F.lit(max_val) - F.lit(min_val)) + F.lit(mean_val),
        )
|