Coverage for jstark / grocery / recency_weighted_basket.py: 100%
67 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-23 22:34 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-23 22:34 +0000
1"""RecencyWeightedBasket feature"""
3from datetime import date
5from jstark.features.feature import DerivedFeature
7from pyspark.sql import Column
8import pyspark.sql.functions as f
10from jstark.feature_period import FeaturePeriod
11from .basket_count import BasketCount
12from .approx_basket_count import ApproxBasketCount
class RecencyWeightedApproxBasket(DerivedFeature):
    """Recency-weighted sum of per-period approximate basket counts.

    For every period number from ``feature_period.end`` up to and including
    ``feature_period.start`` an ``ApproxBasketCount`` restricted to that single
    period is built, its column is multiplied by
    ``smoothing_factor ** period`` and the products are added together.

    NOTE(review): presumably larger period numbers lie further in the past, so
    a smoothing factor below 1 gives older periods less weight - confirm
    against ``FeaturePeriod``.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Store the smoothing factor; everything else goes to the base class.

        Args:
            as_at: Forwarded unchanged to ``DerivedFeature``.
            feature_period: Range of periods that contribute to the feature.
            smoothing_factor: Base of the per-period weight
                (``smoothing_factor ** period``). The generated commentary
                describes it as a value between 0 and 1; it is not validated
                here.
            first_day_of_week: Forwarded unchanged to ``DerivedFeature``.
            use_absolute_periods: Forwarded unchanged to ``DerivedFeature``.
        """
        super().__init__(as_at, feature_period, first_day_of_week, use_absolute_periods)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction time (read-only)."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of the single-period approximate counts.

        Returns:
            ``lit(0.0)`` plus, for each period in the feature period, that
            period's ``ApproxBasketCount`` column times
            ``smoothing_factor ** period``.
        """
        unit = self.feature_period.period_unit_of_measure
        weighted_total = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            # NOTE(review): use_absolute_periods is not forwarded to the
            # per-period feature - confirm that this is intentional.
            period_count = ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit, period, period),
                first_day_of_week=self._first_day_of_week,
            ).column
            weighted_total = weighted_total + period_count * pow(
                self.smoothing_factor, period
            )
        return weighted_total

    def default_value(self) -> Column:
        """Return a null literal as this feature's default value."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Short description naming the smoothing factor and the period unit."""
        unit = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            f" {self.smoothing_factor}, of the approximate number of baskets"
            f" per {unit}"
        )

    @property
    def feature_name(self) -> str:
        """Name made of the unit, the factor as a percentage and the mnemonic.

        Shape: ``RecencyWeightedApproxBasket<Unit>s<percentage>_<mnemonic>``.
        """
        unit = self.feature_period.period_unit_of_measure.name.title()
        # Binary floating point makes e.g. 0.29 * 100 evaluate to
        # 28.999999999999996, which a bare int() would truncate to 28. Rounding
        # away that representation noise first yields the intended 29 while
        # still truncating genuine fractions (0.999 -> 99, not 100).
        percentage = int(round(self.smoothing_factor * 100, 6))
        return (
            "RecencyWeightedApproxBasket"
            f"{unit}s"
            f"{percentage}"
            f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, including its exact sibling.

        The sibling's name is derived by removing ``Approx`` from this
        feature's own name.
        """
        unit = self.feature_period.period_unit_of_measure.name.lower()
        exact_name = self.feature_name.replace("Approx", "")
        return (
            "Exponential smoothing "
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            " is an alternative to a simple moving average which"
            " gives greater weighting to more recent observations, thus is an"
            " exponentially weighted moving average. It uses a smoothing factor"
            f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            " Here the approximate number of baskets per"
            f" {unit} is smoothed."
            " This feature is considered to be a highly effective predictor of future"
            " purchases, if a customer has bought a product recently then there's a"
            " relatively high probability they will buy it again."
            f" This is less accurate than {exact_name}"
            " though is less computationally expensive to calculate because it "
            " does not calculate a distinct count for each"
            f" {unit}."
        )

    def __repr__(self) -> str:
        """Debug representation listing the main constructor arguments."""
        return (
            f"{self.__class__.__name__}"
            f"(as_at={self.as_at}"
            f", feature_period='{self.feature_period.mnemonic}'"
            f", smoothing_factor={self.smoothing_factor}"
            f", first_day_of_week={self._first_day_of_week!r})"
        )
class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Recency-weighted sum of per-period basket counts.

    Same construction as ``RecencyWeightedApproxBasket`` except that each
    period contributes a ``BasketCount`` column instead of an
    ``ApproxBasketCount`` column.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Pass every argument straight through to the parent constructor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=smoothing_factor,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

    def column_expression(self) -> Column:
        """Build the weighted sum of the single-period basket counts.

        Returns:
            ``lit(0.0)`` plus, for each period number from
            ``feature_period.end`` to ``feature_period.start`` inclusive, that
            period's ``BasketCount`` column times
            ``smoothing_factor ** period``.
        """
        factor = super().smoothing_factor
        unit = self.feature_period.period_unit_of_measure
        first = self.feature_period.end
        last = self.feature_period.start
        weighted_total = f.lit(0.0)
        for offset in range(first, last + 1):
            single_period = FeaturePeriod(unit, offset, offset)
            basket_count = BasketCount(
                as_at=self.as_at,
                feature_period=single_period,
                first_day_of_week=self._first_day_of_week,
            )
            weighted_total = weighted_total + basket_count.column * pow(factor, offset)
        return weighted_total

    @property
    def description_subject(self) -> str:
        """The parent's description with the text ``"approximate "`` removed."""
        approx_subject = super().description_subject
        return approx_subject.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, its cost and its alternative.

        The suggested alternative's name is derived by replacing ``Basket``
        with ``ApproxBasket`` in this feature's own name.
        """
        unit = self.feature_period.period_unit_of_measure.name.lower()
        period_total = self.feature_period.number_of_periods
        plural = "s" if self.feature_period.number_of_periods > 1 else ""
        approx_name = self.feature_name.replace("Basket", "ApproxBasket")
        return (
            "Exponential smoothing "
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            " is an alternative to a simple moving average which"
            " gives greater weighting to more recent observations, thus is an"
            " exponentially weighted moving average. It uses a smoothing factor"
            f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            " Here the number of baskets per"
            f" {unit} is smoothed."
            " This feature is considered to be a highly effective predictor of future"
            " purchases, if a customer has bought a product recently then there's a"
            " relatively high probability they will buy it again."
            " This is computationally expensive to calculate because it "
            " requires a distinct count of baskets for each"
            f" {unit}. Every"
            " distinct count operation is expensive so the less that are performed,"
            " the better (YMMV based on a number of factors, "
            "mainly the volume of data"
            " being processed). For this reason you should consider choosing a small"
            f" number of {unit}s"
            " for the feature period. This feature"
            f" ({self.feature_name}) is for"
            f" {period_total}"
            f" {unit}"
            f"{plural}. You might"
            f" consider using {approx_name}"
            " instead which is less accurate but computationally cheaper."
        )

    @property
    def feature_name(self) -> str:
        """The parent's feature name with the ``Approx`` token removed."""
        approx_name = super().feature_name
        return approx_name.replace("Approx", "")
class RecencyWeightedBasket90(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.9 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )
class RecencyWeightedBasket95(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.95 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )
class RecencyWeightedBasket99(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.99 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )
class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.9 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )
class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.95 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )
class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.99 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )