Coverage for jstark/grocery/recency_weighted_basket.py

1"""RecencyWeightedBasket feature"""

3from datetime import date

5from jstark.features.feature import DerivedFeature

7from pyspark.sql import Column

8import pyspark.sql.functions as f

10from jstark.feature_period import FeaturePeriod

11from .basket_count import BasketCount

12from .approx_basket_count import ApproxBasketCount

class RecencyWeightedApproxBasket(DerivedFeature):
    """Recency-weighted sum of per-period approximate basket counts.

    For each period index covered by ``feature_period`` the column of a
    single-period ``ApproxBasketCount`` is multiplied by
    ``smoothing_factor ** period`` and the products are added together.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Store the smoothing factor and delegate the rest to the base class.

        Args:
            as_at: Passed through to ``DerivedFeature``.
            feature_period: Passed through to ``DerivedFeature``; its ``start``
                and ``end`` bound the periods that are summed.
            smoothing_factor: Base of the per-period weight. The commentary
                text describes it as a value between 0 & 1; it is not
                validated here.
            first_day_of_week: Passed through to ``DerivedFeature``.
            use_absolute_periods: Passed through to ``DerivedFeature``.
        """
        super().__init__(as_at, feature_period, first_day_of_week, use_absolute_periods)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction (read-only)."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of the single-period count columns."""
        unit = self.feature_period.period_unit_of_measure
        # NOTE(review): use_absolute_periods is accepted by __init__ but is not
        # forwarded to the per-period features built below - confirm that this
        # is intended.
        expr = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            per_period_count = ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit, period, period),
                first_day_of_week=self._first_day_of_week,
            ).column
            # The weight is smoothing_factor raised to the period index.
            expr = expr + per_period_count * pow(self.smoothing_factor, period)
        return expr

    def default_value(self) -> Column:
        """Return a null literal as the default value."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Short description naming the smoothing factor and period unit."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            + f" {self.smoothing_factor}, of the approximate number of baskets"
            + f" per {unit_name}"
        )

    @property
    def feature_name(self) -> str:
        """Feature name embedding the unit, the factor as a percentage, and the
        period mnemonic."""
        # Round away binary floating-point error before truncating, otherwise
        # e.g. 0.29 * 100 == 28.999999999999996 would be rendered as 28.
        percentage = int(round(self.smoothing_factor * 100, 9))
        return (
            "RecencyWeightedApproxBasket"
            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
            + f"{percentage}"
            + f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature for human readers."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the approximate number of baskets per"
            + f" {unit_name} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            + f" This is less accurate than {self.feature_name.replace('Approx', '')}"
            + " though is less computationally expensive to calculate because it "
            + " does not calculate a distinct count for each"
            + f" {unit_name}."
        )

    def __repr__(self) -> str:
        """Return a constructor-like representation of this feature."""
        return (
            f"{self.__class__.__name__}"
            f"(as_at={self.as_at}"
            f", feature_period='{self.feature_period.mnemonic}'"
            f", smoothing_factor={self.smoothing_factor}"
            f", first_day_of_week={self._first_day_of_week!r})"
        )

class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Recency-weighted sum of per-period basket counts.

    Same weighting as ``RecencyWeightedApproxBasket`` but built from
    ``BasketCount`` columns instead of ``ApproxBasketCount`` columns.
    """

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        smoothing_factor: float,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward every argument unchanged to ``RecencyWeightedApproxBasket``."""
        super().__init__(
            as_at,
            feature_period,
            smoothing_factor,
            first_day_of_week,
            use_absolute_periods,
        )

    def column_expression(self) -> Column:
        """Build the weighted sum of the single-period count columns."""
        unit = self.feature_period.period_unit_of_measure
        # NOTE(review): use_absolute_periods is accepted by __init__ but is not
        # forwarded to the per-period features built below - confirm that this
        # is intended.
        expr = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            per_period_count = BasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit, period, period),
                first_day_of_week=self._first_day_of_week,
            ).column
            # The weight is smoothing_factor raised to the period index.
            expr = expr + per_period_count * pow(self.smoothing_factor, period)
        return expr

    @property
    def description_subject(self) -> str:
        """The parent class's description with the word "approximate "
        removed."""
        return super().description_subject.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature for human readers."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        period_count = self.feature_period.number_of_periods
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the number of baskets per"
            + f" {unit_name} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            + " This is computationally expensive to calculate because it "
            + " requires a distinct count of baskets for each"
            + f" {unit_name}. Every"
            + " distinct count operation is expensive so the less that are performed,"
            + " the better (YMMV based on a number of factors, "
            + "mainly the volume of data"
            + " being processed). For this reason you should consider choosing a small"
            + f" number of {unit_name}s"
            + " for the feature period. This feature"
            + f" ({self.feature_name}) is for"
            + f" {period_count}"
            + f" {unit_name}"
            + f"{'s' if period_count > 1 else ''}. You might"
            + f" consider using {self.feature_name.replace('Basket', 'ApproxBasket')}"
            + " instead which is less accurate but computationally cheaper."
        )

    @property
    def feature_name(self) -> str:
        """The parent class's feature name with "Approx" removed."""
        return super().feature_name.replace("Approx", "")

168

169

class RecencyWeightedBasket90(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.9``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

181

182

class RecencyWeightedBasket95(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.95``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

194

195

class RecencyWeightedBasket99(RecencyWeightedBasket):
    """``RecencyWeightedBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.99``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

207

208

class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.9."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.9``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.9,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

220

221

class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.95."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.95``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.95,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

233

234

class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """``RecencyWeightedApproxBasket`` with the smoothing factor fixed at 0.99."""

    def __init__(
        self,
        as_at: date,
        feature_period: FeaturePeriod,
        first_day_of_week: str | None = None,
        use_absolute_periods: bool = False,
    ) -> None:
        """Forward the arguments to the parent with ``smoothing_factor=0.99``."""
        super().__init__(
            as_at=as_at,
            feature_period=feature_period,
            smoothing_factor=0.99,
            first_day_of_week=first_day_of_week,
            use_absolute_periods=use_absolute_periods,
        )

Coverage for jstark / grocery / recency_weighted_basket.py: 100%

67 statements