Coverage for jstark/features/recency_weighted_basket.py: 100%

65 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-02-25 20:09 +0000

1"""RecencyWeightedBasket feature""" 

2from datetime import date 

3 

4from .feature import DerivedFeature 

5 

6from pyspark.sql import Column 

7import pyspark.sql.functions as f 

8 

9from jstark.feature_period import FeaturePeriod 

10from .basket_count import BasketCount 

11from .approx_basket_count import ApproxBasketCount 

12 

13 

class RecencyWeightedApproxBasket(DerivedFeature):
    """Exponentially weighted sum of per-period approximate basket counts.

    For every period offset between ``feature_period.end`` and
    ``feature_period.start`` (inclusive) the single-period
    ``ApproxBasketCount`` column is multiplied by
    ``smoothing_factor ** offset`` and the products are added together.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Store the smoothing factor and initialise the base feature.

        Args:
            as_at: Passed through to the base class and to every
                per-period ``ApproxBasketCount``.
            feature_period: The range of periods to aggregate over.
            smoothing_factor: Base of the per-period weight. The feature's
                own commentary describes it as lying between 0 and 1;
                that range is not enforced here.
        """
        super().__init__(as_at, feature_period)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period approximate basket counts.

        Returns:
            ``lit(0.0)`` plus, for each offset ``p`` in
            ``range(end, start + 1)``, the ``ApproxBasketCount`` column for
            the one-period window ``(p, p)`` times ``smoothing_factor ** p``.
        """
        expr = f.lit(0.0)
        # NOTE(review): presumably smaller offsets are more recent periods, so
        # a factor below 1 weights recent activity more heavily - confirm
        # against FeaturePeriod.
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            expr = expr + ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(
                    self.feature_period.period_unit_of_measure, period, period
                ),
            ).column * pow(self.smoothing_factor, period)
        return expr

    def default_value(self) -> Column:
        """Return a null literal column as this feature's default value."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Short description naming the smoothing factor and the period unit."""
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            + f" {self.smoothing_factor}, of the approximate number of baskets"
            + f" per {self.feature_period.period_unit_of_measure.name.lower()}"
        )

    @property
    def feature_name(self) -> str:
        """Feature name: class stem, pluralised unit, factor as a percentage
        and the feature period mnemonic.
        """
        # round() rather than int(): binary floating point makes products such
        # as 0.29 * 100 evaluate to 28.999999999999996, which int() would
        # truncate to 28 and so mislabel the feature.
        return (
            "RecencyWeightedApproxBasket"
            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
            + f"{round(self.smoothing_factor * 100)}"
            + f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature and how it compares with the
        exact (non-approximate) variant.
        """
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the approximate number of baskets per"
            + f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            + f" This is less accurate than {self.feature_name.replace('Approx', '')}"
            + " though is less computationally expensive to calculate because it "
            + " does not calculate a distinct count for each"
            + f" {self.feature_period.period_unit_of_measure.name.lower()}."
        )

77 

78 

class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Exponentially weighted sum of per-period exact basket counts.

    Mirrors RecencyWeightedApproxBasket but is built from ``BasketCount``
    instead of ``ApproxBasketCount``.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Forward all arguments unchanged to the parent initialiser."""
        super().__init__(as_at, feature_period, smoothing_factor)

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period ``BasketCount`` columns.

        Each offset ``p`` in ``range(end, start + 1)`` contributes the
        ``BasketCount`` column for the one-period window ``(p, p)`` multiplied
        by ``smoothing_factor ** p``; the terms are added onto ``lit(0.0)``.
        """
        # Zero-argument super() cannot be used inside the generator below, so
        # the factor is looked up once here.
        decay = super().smoothing_factor
        unit_of_measure = self.feature_period.period_unit_of_measure
        offsets = range(self.feature_period.end, self.feature_period.start + 1)
        weighted_counts = (
            BasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit_of_measure, offset, offset),
            ).column
            * pow(decay, offset)
            for offset in offsets
        )
        # sum() folds left-to-right from the start value, giving the same
        # expression shape as accumulating in a loop.
        return sum(weighted_counts, f.lit(0.0))

    @property
    def description_subject(self) -> str:
        """The parent's description subject with every "approximate "
        occurrence removed.
        """
        approx_subject = super().description_subject
        return approx_subject.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, including its cost and a
        pointer to the cheaper approximate variant.
        """
        unit = self.feature_period.period_unit_of_measure.name.lower()
        period_count = self.feature_period.number_of_periods
        own_name = self.feature_name
        approx_name = own_name.replace("Basket", "ApproxBasket")
        return (
            "Exponential smoothing "
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            " is an alternative to a simple moving average which"
            " gives greater weighting to more recent observations, thus is an"
            " exponentially weighted moving average. It uses a smoothing factor"
            f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            " Here the number of baskets per"
            f" {unit} is smoothed."
            " This feature is considered to be a highly effective predictor of future"
            " purchases, if a customer has bought a product recently then there's a"
            " relatively high probability they will buy it again."
            " This is computationally expensive to calculate because it "
            " requires a distinct count of baskets for each"
            f" {unit}. Every"
            " distinct count operation is expensive so the less that are performed,"
            " the better (YMMV based on a number of factors, "
            "mainly the volume of data"
            " being processed). For this reason you should consider choosing a small"
            f" number of {unit}s"
            " for the feature period. This feature"
            f" ({own_name}) is for"
            f" {period_count}"
            f" {unit}"
            f"{'s' if period_count > 1 else ''}. You might"
            f" consider using {approx_name}"
            " instead which is less accurate but computationally cheaper."
        )

    @property
    def feature_name(self) -> str:
        """The parent's feature name with every "Approx" occurrence removed."""
        approx_name = super().feature_name
        return approx_name.replace("Approx", "")

140 

141 

class RecencyWeightedBasket90(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.9)

145 

146 

class RecencyWeightedBasket95(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.95)

150 

151 

class RecencyWeightedBasket99(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.99)

155 

156 

class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.9)

160 

161 

class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.95)

165 

166 

class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        super().__init__(as_at, feature_period, smoothing_factor=0.99)