Coverage for jstark / grocery / recency_weighted_basket.py: 100%

67 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-23 22:34 +0000

1"""RecencyWeightedBasket feature""" 

2 

3from datetime import date 

4 

5from jstark.features.feature import DerivedFeature 

6 

7from pyspark.sql import Column 

8import pyspark.sql.functions as f 

9 

10from jstark.feature_period import FeaturePeriod 

11from .basket_count import BasketCount 

12from .approx_basket_count import ApproxBasketCount 

13 

14 

class RecencyWeightedApproxBasket(DerivedFeature):
    """Recency-weighted sum of per-period approximate basket counts.

    For every single period index ``p`` between ``feature_period.end`` and
    ``feature_period.start`` (inclusive) the ``ApproxBasketCount`` of that one
    period is multiplied by ``smoothing_factor ** p`` and the products are
    added together.  With a smoothing factor between 0 and 1, lower-numbered
    periods therefore carry more weight than higher-numbered ones.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Create the feature.

        Args:
            as_at: Forwarded unchanged to ``DerivedFeature.__init__``.
            feature_period: Range of periods to aggregate over; forwarded
                unchanged to ``DerivedFeature.__init__``.
            smoothing_factor: Base of the per-period weight
                (``smoothing_factor ** period``).
            first_day_of_week: Forwarded unchanged to
                ``DerivedFeature.__init__``.
            use_absolute_periods: Forwarded unchanged to
                ``DerivedFeature.__init__``.
        """
        super().__init__(as_at, feature_period, first_day_of_week, use_absolute_periods)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction time (read-only)."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period approximate basket counts.

        Returns:
            ``lit(0.0)`` plus, for each period index in the feature period,
            that period's ``ApproxBasketCount`` column multiplied by
            ``smoothing_factor ** period``.
        """
        unit_of_measure = self.feature_period.period_unit_of_measure
        weighted_total = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            # One single-period feature (start == end == period) per index.
            # NOTE(review): use_absolute_periods is not forwarded to the
            # per-period feature - confirm this is intended.
            single_period_count = ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit_of_measure, period, period),
                first_day_of_week=self._first_day_of_week,
            ).column
            weighted_total = weighted_total + single_period_count * pow(
                self.smoothing_factor, period
            )
        return weighted_total

    def default_value(self) -> Column:
        """Return a null literal as the feature's default value."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Human-readable subject line naming the factor and period unit."""
        unit = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            f" {self.smoothing_factor}, of the approximate number of baskets"
            f" per {unit}"
        )

    @property
    def feature_name(self) -> str:
        """Feature name: class stem, period unit, factor as percent, mnemonic.

        For example a smoothing factor of 0.9 contributes ``90`` to the name.
        """
        # Truncate to a whole percentage, but round away binary floating-point
        # noise first: 0.29 * 100 is 28.999999999999996, which a bare int()
        # would turn into 28 and so mislabel the feature.
        percent = int(round(self.smoothing_factor * 100, 9))
        return (
            "RecencyWeightedApproxBasket"
            f"{self.feature_period.period_unit_of_measure.name.title()}s"
            f"{percent}"
            f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature for generated documentation."""
        unit = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponential smoothing "
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            " is an alternative to a simple moving average which"
            " gives greater weighting to more recent observations, thus is an"
            " exponentially weighted moving average. It uses a smoothing factor"
            f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            " Here the approximate number of baskets per"
            f" {unit} is smoothed."
            " This feature is considered to be a highly effective predictor of future"
            " purchases, if a customer has bought a product recently then there's a"
            " relatively high probability they will buy it again."
            f" This is less accurate than {self.feature_name.replace('Approx', '')}"
            " though is less computationally expensive to calculate because it "
            " does not calculate a distinct count for each"
            f" {unit}."
        )

    def __repr__(self) -> str:
        """Debug representation listing the main constructor arguments."""
        return (
            f"{self.__class__.__name__}"
            f"(as_at={self.as_at}"
            f", feature_period='{self.feature_period.mnemonic}'"
            f", smoothing_factor={self.smoothing_factor}"
            f", first_day_of_week={self._first_day_of_week!r})"
        )

93 

94 

class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Recency-weighted sum of per-period basket counts.

    Same weighting scheme as ``RecencyWeightedApproxBasket`` but each period
    is measured with ``BasketCount`` instead of ``ApproxBasketCount``.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Pass every argument straight through to the parent constructor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=smoothing_factor,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period basket counts.

        Returns:
            ``lit(0.0)`` plus, for each period index in the feature period,
            that period's ``BasketCount`` column multiplied by the smoothing
            factor raised to the period index.
        """
        # Zero-argument super() cannot be used inside the generator below, so
        # the parent's smoothing factor is read once up front.
        factor = super().smoothing_factor
        unit_of_measure = self.feature_period.period_unit_of_measure
        period_indices = range(self.feature_period.end, self.feature_period.start + 1)
        weighted_counts = (
            BasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit_of_measure, idx, idx),
                first_day_of_week=self._first_day_of_week,
            ).column
            * pow(factor, idx)
            for idx in period_indices
        )
        # sum() folds left-to-right from the start value, yielding the same
        # expression tree as repeated ``expr = expr + term``.
        return sum(weighted_counts, f.lit(0.0))

    @property
    def description_subject(self) -> str:
        """The parent's description with the text ``"approximate "`` removed."""
        approx_description = super().description_subject
        return approx_description.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, including its cost trade-off."""
        unit = self.feature_period.period_unit_of_measure.name.lower()
        period_count = self.feature_period.number_of_periods
        plural_suffix = "s" if period_count > 1 else ""
        own_name = self.feature_name
        approx_name = own_name.replace("Basket", "ApproxBasket")
        fragments = [
            "Exponential smoothing ",
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)",
            " is an alternative to a simple moving average which",
            " gives greater weighting to more recent observations, thus is an",
            " exponentially weighted moving average. It uses a smoothing factor",
            f" between 0 & 1 which for this feature is {self.smoothing_factor}.",
            " Here the number of baskets per",
            f" {unit} is smoothed.",
            " This feature is considered to be a highly effective predictor of future",
            " purchases, if a customer has bought a product recently then there's a",
            " relatively high probability they will buy it again.",
            " This is computationally expensive to calculate because it ",
            " requires a distinct count of baskets for each",
            f" {unit}. Every",
            " distinct count operation is expensive so the less that are performed,",
            " the better (YMMV based on a number of factors, ",
            "mainly the volume of data",
            " being processed). For this reason you should consider choosing a small",
            f" number of {unit}s",
            " for the feature period. This feature",
            f" ({own_name}) is for",
            f" {period_count}",
            f" {unit}",
            f"{plural_suffix}. You might",
            f" consider using {approx_name}",
            " instead which is less accurate but computationally cheaper.",
        ]
        return "".join(fragments)

    @property
    def feature_name(self) -> str:
        """The parent's feature name with the text ``"Approx"`` removed."""
        approx_feature_name = super().feature_name
        return approx_feature_name.replace("Approx", "")

168 

169 

class RecencyWeightedBasket90(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.9 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

181 

182 

class RecencyWeightedBasket95(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.95 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

194 

195 

class RecencyWeightedBasket99(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.99 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

207 

208 

class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.9 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

220 

221 

class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.95 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

233 

234 

class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent, supplying 0.99 as the factor."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )