Coverage for jstark/features/recency_weighted_basket.py: 100%
65 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-02-25 20:09 +0000
« prev ^ index » next coverage.py v7.4.3, created at 2024-02-25 20:09 +0000
1"""RecencyWeightedBasket feature"""
2from datetime import date
4from .feature import DerivedFeature
6from pyspark.sql import Column
7import pyspark.sql.functions as f
9from jstark.feature_period import FeaturePeriod
10from .basket_count import BasketCount
11from .approx_basket_count import ApproxBasketCount
class RecencyWeightedApproxBasket(DerivedFeature):
    """Recency-weighted sum of per-period approximate basket counts.

    For every period index ``p`` in ``feature_period.end`` to
    ``feature_period.start`` (inclusive) the approximate basket count of that
    single period is multiplied by ``smoothing_factor ** p`` and the products
    are added together.

    NOTE(review): the weights are not normalised, so the expression is a
    weighted sum even though the description text calls it a moving average.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Store the smoothing factor alongside the usual feature arguments.

        Args:
            as_at: date passed through to the base feature.
            feature_period: span of periods over which the counts are weighted.
            smoothing_factor: base of the exponential weights; the commentary
                text describes it as a value between 0 and 1 (not validated
                here).
        """
        super().__init__(as_at, feature_period)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period ApproxBasketCount columns."""
        unit = self.feature_period.period_unit_of_measure
        expr = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            # One single-period feature per index, weighted by factor ** index.
            # Presumably a smaller index is a more recent period, so recent
            # periods weigh more when the factor is below 1 - confirm against
            # FeaturePeriod.
            single_period_count = ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit, period, period),
            ).column
            expr = expr + single_period_count * pow(self.smoothing_factor, period)
        return expr

    def default_value(self) -> Column:
        """Return a null literal as the default value of this feature."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Short description naming the smoothing factor and the period unit."""
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            + f" {self.smoothing_factor}, of the approximate number of baskets"
            + f" per {self.feature_period.period_unit_of_measure.name.lower()}"
        )

    @property
    def feature_name(self) -> str:
        """Name made of the class stem, pluralised unit, factor and mnemonic.

        The factor is rendered as a whole-number percentage. ``round`` is used
        rather than ``int`` because ``int`` truncates binary floating-point
        error (``int(0.29 * 100)`` is 28, not 29).
        """
        return (
            "RecencyWeightedApproxBasket"
            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
            + f"{round(self.smoothing_factor * 100)}"
            + f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, for human readers."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the approximate number of baskets per"
            + f" {unit_name} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            + f" This is less accurate than {self.feature_name.replace('Approx', '')}"
            # No trailing space here: the next fragment starts with one.
            + " though is less computationally expensive to calculate because it"
            + " does not calculate a distinct count for each"
            + f" {unit_name}."
        )
class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Recency-weighted sum of per-period (exact) basket counts.

    Same construction as RecencyWeightedApproxBasket, except that each
    single-period term uses BasketCount rather than ApproxBasketCount.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Pass all three arguments straight through to the parent class."""
        super().__init__(as_at, feature_period, smoothing_factor)

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period BasketCount columns."""
        unit = self.feature_period.period_unit_of_measure
        expr = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            # One single-period feature per index, weighted by factor ** index.
            single_period_count = BasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit, period, period),
            ).column
            # Read the inherited property through self, as every other method
            # of this class does.
            expr = expr + single_period_count * pow(self.smoothing_factor, period)
        return expr

    @property
    def description_subject(self) -> str:
        """Parent's description with the word "approximate " removed."""
        return super().description_subject.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, for human readers."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        period_count = self.feature_period.number_of_periods
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the number of baskets per"
            + f" {unit_name} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            # No trailing space here: the next fragment starts with one.
            + " This is computationally expensive to calculate because it"
            + " requires a distinct count of baskets for each"
            + f" {unit_name}. Every"
            + " distinct count operation is expensive so the less that are performed,"
            + " the better (YMMV based on a number of factors, "
            + "mainly the volume of data"
            + " being processed). For this reason you should consider choosing a small"
            + f" number of {unit_name}s"
            + " for the feature period. This feature"
            + f" ({self.feature_name}) is for"
            + f" {period_count}"
            + f" {unit_name}"
            + f"{'s' if period_count > 1 else ''}. You might"
            + f" consider using {self.feature_name.replace('Basket', 'ApproxBasket')}"
            + " instead which is less accurate but computationally cheaper."
        )

    @property
    def feature_name(self) -> str:
        """Parent's feature name with the substring "Approx" removed."""
        return super().feature_name.replace("Approx", "")
class RecencyWeightedBasket90(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.9
        )
class RecencyWeightedBasket95(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.95
        )
class RecencyWeightedBasket99(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.99
        )
class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.9
        )
class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.95
        )
class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.99
        )