Coverage for jstark / sample / mealkit_orders.py: 100%

54 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-30 09:36 +0000

1import random 

2from functools import cached_property 

3import uuid 

4from datetime import date 

5from typing import Any, Iterable 

6from decimal import Decimal 

7from pyspark.sql import SparkSession, DataFrame 

8from pyspark.sql.types import ( 

9 IntegerType, 

10 StringType, 

11 StructField, 

12 StructType, 

13 TimestampType, 

14 DecimalType, 

15) 

16from faker import Faker 

17from faker.providers import DynamicProvider 

18 

19 

class FakeMealkitOrders:
    """Generate a Spark DataFrame of fake meal-kit order lines.

    Every generated order carries one customer, timestamp, order id,
    product, cuisine, allergen and discount, plus between two and five
    recipes. The resulting DataFrame holds one row per (order, recipe) pair.
    """

    def __init__(self, seed: int | None = None, number_of_orders: int = 1000):
        """Store the generation settings; nothing is built until ``df`` is read.

        Args:
            seed: Value passed to ``Faker.seed`` and ``random.seed`` before
                generating data. ``None`` leaves both unseeded; any integer,
                including ``0``, is applied.
            number_of_orders: Number of orders to generate.
        """
        self.seed = seed
        self.number_of_orders = number_of_orders

    @property
    def mealkit_orders_schema(self) -> StructType:
        """Return the schema of ``df``: one nullable field per output column."""
        return StructType(
            [
                StructField("Timestamp", TimestampType(), True),
                StructField("Customer", StringType(), True),
                StructField("Product", StringType(), True),
                StructField("Recipe", StringType(), True),
                StructField("Cuisine", StringType(), True),
                StructField("Allergen", StringType(), True),
                StructField("Quantity", IntegerType(), True),
                StructField("Order", StringType(), True),
                StructField("Discount", DecimalType(10, 2), True),
            ]
        )

    @staticmethod
    def flatten_mealkit_orders(mealkit_orders: list[Any]) -> Iterable[dict[str, Any]]:
        """Expand nested orders into one flat dict per (order, recipe) pair.

        Args:
            mealkit_orders: Order dicts with the keys ``Customer``,
                ``Product``, ``Cuisine``, ``Allergen``, ``Order``,
                ``Timestamp`` and ``Recipes`` (a list of dicts), and
                optionally ``Discount``.

        Returns:
            A list with one dict per recipe. Each dict holds the order-level
            values followed by that recipe's own keys. ``Discount`` is
            ``None`` when the order dict does not carry one.

        Raises:
            KeyError: If an order lacks one of the required keys.
        """
        return [
            {
                "Customer": order["Customer"],
                "Product": order["Product"],
                "Cuisine": order["Cuisine"],
                "Allergen": order["Allergen"],
                "Order": order["Order"],
                "Timestamp": order["Timestamp"],
                # Previously omitted, which left the schema's Discount column
                # entirely null. The order-level value is repeated on every
                # recipe row of the order.
                "Discount": order.get("Discount"),
                **recipe,
            }
            for order in mealkit_orders
            for recipe in order["Recipes"]
        ]

    @cached_property
    def df(self) -> DataFrame:
        """Build the fake orders and return them as a Spark DataFrame.

        The result is computed once per instance and cached. Rows follow
        ``mealkit_orders_schema``, one row per recipe within each order.
        """
        products_provider = DynamicProvider(
            provider_name="product",
            elements=[
                "classic-plan",
                "preset-box-bc",
                "balanced-living-t1",
                "dinner-box",
                "classic-plan-t11",
                "classic-plan-t12",
                "classic-plan-t13",
                "classic-plan-t14",
                "classic-plan-t15",
            ],
        )
        recipes_provider = DynamicProvider(
            provider_name="recipe",
            elements=[
                "Banging bangers and mash",
                "Fish and chips",
                "Pizza and salad",
                "Chicken curry",
                "Beef stew",
                "Vegetable lasagna",
                "Salad and bread",
                "Soup and bread",
                "Pasta and sauce",
            ],
        )
        cuisines_provider = DynamicProvider(
            provider_name="cuisine",
            elements=[
                "Italian",
                "French",
                "Spanish",
            ],
        )
        allergens_provider = DynamicProvider(
            provider_name="allergen",
            elements=[
                "Gluten",
                "Eggs",
                "Milk",
                "Soy",
            ],
        )

        # Compare against None so that seed=0 is honoured rather than being
        # treated as "no seed".
        seeded = self.seed is not None

        fake = Faker()
        if seeded:
            Faker.seed(self.seed)

        products_fake = Faker()
        products_fake.add_provider(products_provider)

        recipes_fake = Faker()
        recipes_fake.add_provider(recipes_provider)

        cuisines_fake = Faker()
        cuisines_fake.add_provider(cuisines_provider)

        allergens_fake = Faker()
        allergens_fake.add_provider(allergens_provider)

        mealkit_orders = []

        possible_quantities = [1, 2]
        if seeded:
            random.seed(self.seed)
        # Pre-draw the quantities; a quantity of 1 is weighted 100:1 over 2.
        quantities = random.choices(
            possible_quantities,
            weights=[100, 1],
            k=self.number_of_orders * len(recipes_provider.elements),
        )
        for order in range(self.number_of_orders):
            recipes = []
            for recipe_index in range(random.randint(2, 5)):
                # ``unique`` prevents the same recipe appearing twice in an order.
                recipe_name = recipes_fake.unique.recipe()
                # NOTE(review): the stride is len(possible_quantities) (2) but
                # an order has up to 5 recipes, so consecutive orders read
                # overlapping slots of ``quantities``. Left unchanged because
                # altering it would change the data produced for a given seed.
                quantity = quantities[(order * len(possible_quantities)) + recipe_index]
                recipes.append(
                    {
                        "Recipe": recipe_name,
                        "Quantity": quantity,
                    }
                )
            # The order of the calls below is kept as-is so that the sequence
            # of random draws stays the same.
            mealkit_orders.append(
                {
                    "Customer": fake.name(),
                    "Timestamp": fake.date_time_between(
                        start_date=date(2021, 1, 1), end_date=date(2021, 12, 31)
                    ),
                    # NOTE(review): uuid.uuid4() is not controlled by ``seed``,
                    # so order ids differ between runs even when seeded.
                    "Order": str(uuid.uuid4()),
                    "Product": products_fake.product(),
                    "Cuisine": cuisines_fake.cuisine(),
                    "Allergen": allergens_fake.allergen(),
                    "Recipes": recipes,
                    # Round to two decimal places to match DecimalType(10, 2).
                    "Discount": Decimal(random.uniform(0, 5)).quantize(
                        Decimal("0.01")
                    ),
                }
            )
            # Reset uniqueness tracking so the next order can reuse recipes.
            recipes_fake.unique.clear()
        flattened_mealkit_orders = self.flatten_mealkit_orders(mealkit_orders)
        spark = SparkSession.builder.getOrCreate()
        return spark.createDataFrame(
            flattened_mealkit_orders,
            schema=self.mealkit_orders_schema,  # type: ignore
        )