Validate myReduceByKey

Let us perform a few tasks to validate the myReduceByKey function.

%run 04_develop_myMap.ipynb
%run 08_develop_myReduceByKey.ipynb
  • Use the function to get the count by date from orders.

# Load the orders file as a list of lines (one string per record).
# The context manager closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
orders_path = "/data/retail_db/orders/part-00000"
with open(orders_path) as orders_file:
    orders = orders_file.read().splitlines()
orders[:10]
['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']
# Pair the second comma-separated field of each record (the order date in the
# sample data) with a count of 1, so the counts can be summed per date later.
orders_map = myMap(
    orders,
    lambda rec: (rec.split(',')[1], 1),
)
orders_map[:10]
[('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1),
 ('2013-07-25 00:00:00.0', 1)]
# Add up the 1s emitted for each date key to obtain the order count per date.
order_count_by_date = myReduceByKey(
    orders_map,
    lambda total, element: total + element,
)
order_count_by_date[:10]
[('2013-07-25 00:00:00.0', 143),
 ('2013-07-26 00:00:00.0', 269),
 ('2013-07-27 00:00:00.0', 202),
 ('2013-07-28 00:00:00.0', 187),
 ('2013-07-29 00:00:00.0', 253),
 ('2013-07-30 00:00:00.0', 227),
 ('2013-07-31 00:00:00.0', 252),
 ('2013-08-01 00:00:00.0', 246),
 ('2013-08-02 00:00:00.0', 224),
 ('2013-08-03 00:00:00.0', 183)]
  • Use the function to get the revenue for each order id.

# Load the order_items file as a list of lines (one string per record).
# The context manager closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
order_items_path = "/data/retail_db/order_items/part-00000"
with open(order_items_path) as order_items_file:
    order_items = order_items_file.read().splitlines()
order_items[:10]
['1,1,957,1,299.98,299.98',
 '2,2,1073,1,199.99,199.99',
 '3,2,502,5,250.0,50.0',
 '4,2,403,1,129.99,129.99',
 '5,4,897,2,49.98,24.99',
 '6,4,365,5,299.95,59.99',
 '7,4,502,3,150.0,50.0',
 '8,4,1014,4,199.92,49.98',
 '9,5,957,1,299.98,299.98',
 '10,5,365,5,299.95,59.99']
def to_order_id_and_subtotal(order_item):
    """Return (order id, subtotal) parsed from one comma-separated record.

    The record is split only once; field index 1 is converted to int and
    field index 4 to float.
    """
    fields = order_item.split(',')
    return int(fields[1]), float(fields[4])


# Build (order id, subtotal) pairs to be aggregated per order id.
order_items_map = myMap(order_items, to_order_id_and_subtotal)
order_items_map[:10]
[(1, 299.98),
 (2, 199.99),
 (2, 250.0),
 (2, 129.99),
 (4, 49.98),
 (4, 299.95),
 (4, 150.0),
 (4, 199.92),
 (5, 299.98),
 (5, 299.95)]
# Sum the subtotals of each order id; the running total is rounded to 2
# decimal places at every combine step.
revenue_per_order = myReduceByKey(
    order_items_map,
    lambda total, subtotal: round(total + subtotal, 2),
)
revenue_per_order[:10]
[(1, 299.98),
 (2, 579.98),
 (4, 699.85),
 (5, 1129.86),
 (7, 579.92),
 (8, 729.84),
 (9, 599.96),
 (10, 651.92),
 (11, 919.79),
 (12, 1299.87)]
# Keep the smallest subtotal seen for each order id. The builtin `min` is
# passed directly as the two-argument combiner.
# NOTE(review): assumes myReduceByKey invokes the combiner with two positional
# arguments -- confirm against its definition.
myReduceByKey(order_items_map, min)[:10]
[(1, 299.98),
 (2, 129.99),
 (4, 49.98),
 (5, 99.96),
 (7, 79.95),
 (8, 50.0),
 (9, 199.98),
 (10, 21.99),
 (11, 49.98),
 (12, 100.0)]
  • Use the function to get the revenue as well as the number of items for each order id.

def to_order_id_and_subtotal_with_count(order_item):
    """Return (order id, (subtotal, 1)) parsed from one comma-separated record.

    The record is split only once; field index 1 is converted to int and
    field index 4 to float. The constant 1 lets the item count be summed
    alongside the subtotal.
    """
    fields = order_item.split(',')
    return int(fields[1]), (float(fields[4]), 1)


# Build (order id, (subtotal, 1)) pairs so revenue and item count can be
# aggregated together per order id.
order_items_map = myMap(order_items, to_order_id_and_subtotal_with_count)
order_items_map[:10]
[(1, (299.98, 1)),
 (2, (199.99, 1)),
 (2, (250.0, 1)),
 (2, (129.99, 1)),
 (4, (49.98, 1)),
 (4, (299.95, 1)),
 (4, (150.0, 1)),
 (4, (199.92, 1)),
 (5, (299.98, 1)),
 (5, (299.95, 1))]
[2, [(199.99, 1), (250.0, 1), (129.99, 1)]]
[2, [(199.99, 1), (250.0, 1), (129.99, 1)]]
# Two sample (subtotal, item count) values, matching the first two entries
# shown for order id 2.
t1, t2 = (199.99, 1), (250.0, 1)
# Element-wise addition: combined subtotal and combined item count.
(t1[0] + t2[0], t1[1] + t2[1])
(449.99, 2)
# Combine (subtotal, count) tuples per order id: subtotals are added and
# rounded to 2 decimal places, counts are added.
myReduceByKey(
    order_items_map,
    lambda acc, item: (round(acc[0] + item[0], 2), acc[1] + item[1]),
)[:10]
[(1, (299.98, 1)),
 (2, (579.98, 3)),
 (4, (699.85, 4)),
 (5, (1129.86, 5)),
 (7, (579.92, 3)),
 (8, (729.84, 4)),
 (9, (599.96, 3)),
 (10, (651.92, 5)),
 (11, (919.79, 5)),
 (12, (1299.87, 5))]