Coverage for jstark/features/recency_weighted_basket.py

1"""RecencyWeightedBasket feature"""

2from datetime import date

4from .feature import DerivedFeature

6from pyspark.sql import Column

7import pyspark.sql.functions as f

9from jstark.feature_period import FeaturePeriod

10from .basket_count import BasketCount

11from .approx_basket_count import ApproxBasketCount

class RecencyWeightedApproxBasket(DerivedFeature):
    """Exponentially weighted sum of per-period approximate basket counts.

    For every individual period covered by ``feature_period`` an
    ``ApproxBasketCount`` column is built, multiplied by
    ``smoothing_factor ** period`` and added to a running total.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Store the smoothing factor and initialise the base feature.

        Args:
            as_at: passed through unchanged to ``DerivedFeature``.
            feature_period: the range of periods the feature spans.
            smoothing_factor: base of the per-period weight. The generated
                commentary describes it as lying between 0 and 1; no range
                check is performed here.
        """
        super().__init__(as_at, feature_period)
        self.__smoothing_factor = smoothing_factor

    @property
    def smoothing_factor(self) -> float:
        """The smoothing factor supplied at construction time."""
        return self.__smoothing_factor

    def column_expression(self) -> Column:
        """Build the weighted sum of single-period approximate basket counts.

        Returns:
            ``0.0 + sum(count(period) * smoothing_factor ** period)`` for every
            period from ``feature_period.end`` to ``feature_period.start``
            inclusive.
        """
        # The unit of measure is the same for every single-period feature.
        unit_of_measure = self.feature_period.period_unit_of_measure
        expr = f.lit(0.0)
        for period in range(self.feature_period.end, self.feature_period.start + 1):
            # A (period, period) FeaturePeriod isolates exactly one period.
            period_count = ApproxBasketCount(
                as_at=self.as_at,
                feature_period=FeaturePeriod(unit_of_measure, period, period),
            ).column
            expr = expr + period_count * pow(self.smoothing_factor, period)
        return expr

    def default_value(self) -> Column:
        """Return a null literal column as the feature's default."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Human-readable summary naming the smoothing factor and period unit."""
        return (
            "Exponentially weighted moving average, with smoothing factor of"
            + f" {self.smoothing_factor}, of the approximate number of baskets"
            + f" per {self.feature_period.period_unit_of_measure.name.lower()}"
        )

    @property
    def feature_name(self) -> str:
        """Feature name: prefix, pluralised unit, factor as a percent, mnemonic.

        For example a smoothing factor of 0.9 contributes ``90`` to the name.
        """
        # Binary floating point makes e.g. 0.29 * 100 evaluate to
        # 28.999999999999996, which a bare int() would truncate to 28.
        # Rounding to six decimal places first removes that representation
        # error while still truncating genuinely fractional percentages.
        percent = int(round(self.smoothing_factor * 100, 6))
        return (
            "RecencyWeightedApproxBasket"
            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
            + f"{percent}"
            + f"_{self.feature_period.mnemonic}"
        )

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature for generated documentation."""
        return (
            "Exponential smoothing "
            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
            + " is an alternative to a simple moving average which"
            + " gives greater weighting to more recent observations, thus is an"
            + " exponentially weighted moving average. It uses a smoothing factor"
            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
            + " Here the approximate number of baskets per"
            + f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
            + " This feature is considered to be a highly effective predictor of future"
            + " purchases, if a customer has bought a product recently then there's a"
            + " relatively high probability they will buy it again."
            + f" This is less accurate than {self.feature_name.replace('Approx', '')}"
            + " though is less computationally expensive to calculate because it "
            + " does not calculate a distinct count for each"
            + f" {self.feature_period.period_unit_of_measure.name.lower()}."
        )

class RecencyWeightedBasket(RecencyWeightedApproxBasket):
    """Exact-count variant of RecencyWeightedApproxBasket.

    Each period's count comes from ``BasketCount`` rather than
    ``ApproxBasketCount``; the weighting scheme is otherwise the same.
    """

    def __init__(
        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
    ) -> None:
        """Forward all three arguments to the parent constructor."""
        super().__init__(
            as_at,
            feature_period,
            smoothing_factor,
        )

    def column_expression(self) -> Column:
        """Return the sum of ``BasketCount`` columns weighted by factor ** period."""
        # super() has to be resolved here, outside any nested scope.
        factor = super().smoothing_factor
        lower = self.feature_period.end
        upper = self.feature_period.start
        weighted_total = f.lit(0.0)
        for n in range(lower, upper + 1):
            # Narrow the feature period down to the single period n.
            one_period = FeaturePeriod(
                self.feature_period.period_unit_of_measure, n, n
            )
            count_column = BasketCount(
                as_at=self.as_at, feature_period=one_period
            ).column
            weighted_total = weighted_total + count_column * pow(factor, n)
        return weighted_total

    @property
    def description_subject(self) -> str:
        """RecencyWeightedApproxBasketXX_periodmnemonic's description with the
        word "approximate " removed."""
        approx_description = super().description_subject
        return approx_description.replace("approximate ", "")

    @property
    def commentary(self) -> str:
        """Long-form explanation of the feature, including its cost trade-off."""
        unit = self.feature_period.period_unit_of_measure.name.lower()
        period_count = self.feature_period.number_of_periods
        plural = "s" if period_count > 1 else ""
        cheaper_alternative = self.feature_name.replace("Basket", "ApproxBasket")
        pieces = [
            "Exponential smoothing ",
            "(https://en.wikipedia.org/wiki/Exponential_smoothing)",
            " is an alternative to a simple moving average which",
            " gives greater weighting to more recent observations, thus is an",
            " exponentially weighted moving average. It uses a smoothing factor",
            f" between 0 & 1 which for this feature is {self.smoothing_factor}.",
            " Here the number of baskets per",
            f" {unit} is smoothed.",
            " This feature is considered to be a highly effective predictor of future",
            " purchases, if a customer has bought a product recently then there's a",
            " relatively high probability they will buy it again.",
            " This is computationally expensive to calculate because it ",
            " requires a distinct count of baskets for each",
            f" {unit}. Every",
            " distinct count operation is expensive so the less that are performed,",
            " the better (YMMV based on a number of factors, ",
            "mainly the volume of data",
            " being processed). For this reason you should consider choosing a small",
            f" number of {unit}s",
            " for the feature period. This feature",
            f" ({self.feature_name}) is for",
            f" {period_count}",
            f" {unit}",
            f"{plural}. You might",
            f" consider using {cheaper_alternative}",
            " instead which is less accurate but computationally cheaper.",
        ]
        return "".join(pieces)

    @property
    def feature_name(self) -> str:
        """RecencyWeightedApproxBasketXX_periodmnemonic's name with the
        substring "Approx" removed."""
        approx_name = super().feature_name
        return approx_name.replace("Approx", "")

140

141

class RecencyWeightedBasket90(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.9."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.9
        )

145

146

class RecencyWeightedBasket95(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.95."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.95
        )

150

151

class RecencyWeightedBasket99(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.99."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.99
        )

155

156

class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.9."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.9
        )

160

161

class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.95."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.95
        )

165

166

class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        """Delegate to the parent constructor with a smoothing factor of 0.99."""
        super().__init__(
            as_at=as_at, feature_period=feature_period, smoothing_factor=0.99
        )

Coverage for jstark/features/recency_weighted_basket.py: 100%

65 statements