Getting unique elements

Let us perform few tasks to understand how to extract unique elements. We can use either of these approaches.

  • We can create a list of elements first and then convert into a set.

  • We can also build set directly while extracting the information.

%%sh

ls -ltr /data/retail_db/orders/part-00000
-rw-r--r-- 1 root root 2999944 Nov 22 16:08 /data/retail_db/orders/part-00000
%%sh

tail /data/retail_db/orders/part-00000
68874,2014-07-03 00:00:00.0,1601,COMPLETE
68875,2014-07-04 00:00:00.0,10637,ON_HOLD
68876,2014-07-06 00:00:00.0,4124,COMPLETE
68877,2014-07-07 00:00:00.0,9692,ON_HOLD
68878,2014-07-08 00:00:00.0,6753,COMPLETE
68879,2014-07-09 00:00:00.0,778,COMPLETE
68880,2014-07-13 00:00:00.0,1117,COMPLETE
68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68882,2014-07-22 00:00:00.0,10000,ON_HOLD
68883,2014-07-23 00:00:00.0,5533,COMPLETE
path = '/data/retail_db/orders/part-00000'
# C:\\users\\itversity\\Research\\data\\retail_db\\orders\\part-00000
orders_file = open(path)
type(orders_file)
_io.TextIOWrapper
orders_raw = orders_file.read()
type(orders_raw)
str
orders_raw.splitlines?
Docstring:
S.splitlines([keepends]) -> list of strings

Return a list of the lines in S, breaking at line boundaries.
Line breaks are not included in the resulting list unless keepends
is given and true.
Type:      builtin_function_or_method
orders = orders_raw.splitlines()
type(orders)
list
orders[:10]
['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']
type(orders[0])
str
len(orders)
68883
%%sh

wc -l /data/retail_db/orders/part-00000
68883 /data/retail_db/orders/part-00000

Task 1

Get all the unique dates from orders data.

order = '1,2013-07-25 00:00:00.0,11599,CLOSED'
order.split(',')[1]
'2013-07-25 00:00:00.0'
order_dates = set()
order_dates.add('2013-07-25 00:00:00.0')
order_dates
{'2013-07-25 00:00:00.0'}
order_dates.add('2013-07-26 00:00:00.0')
order_dates
{'2013-07-25 00:00:00.0', '2013-07-26 00:00:00.0'}
order_dates.add('2013-07-25 00:00:00.0')
order_dates
{'2013-07-25 00:00:00.0', '2013-07-26 00:00:00.0'}
order_dates = set()
for order in orders:
    order_dates.add(order.split(',')[1])
list(order_dates)[:10]
['2013-12-04 00:00:00.0',
 '2014-02-15 00:00:00.0',
 '2014-04-17 00:00:00.0',
 '2014-05-10 00:00:00.0',
 '2014-02-03 00:00:00.0',
 '2013-11-12 00:00:00.0',
 '2014-06-21 00:00:00.0',
 '2014-01-14 00:00:00.0',
 '2013-10-09 00:00:00.0',
 '2013-11-27 00:00:00.0']
len(order_dates)
364
order_dates = {order.split(',')[1] for order in orders}
list(order_dates)[:10]
['2013-12-04 00:00:00.0',
 '2014-02-15 00:00:00.0',
 '2014-04-17 00:00:00.0',
 '2014-05-10 00:00:00.0',
 '2014-02-03 00:00:00.0',
 '2013-11-12 00:00:00.0',
 '2014-06-21 00:00:00.0',
 '2014-01-14 00:00:00.0',
 '2013-10-09 00:00:00.0',
 '2013-11-27 00:00:00.0']
len(order_dates)
364

Task 2

Get all the unique weekend dates from orders data.

order_date = '2014-01-25 00:00:00.0'
import datetime as dt
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f')
datetime.datetime(2014, 1, 25, 0, 0)
d = dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f')
d.weekday?
Docstring:
Return the day of the week represented by the date.
Monday == 0 ... Sunday == 6
Type:      builtin_function_or_method
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday() # Returns 0 to 6 (for Monday to Sunday)
5
import calendar
list(calendar.day_name)
['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
calendar.day_name[5]
'Saturday'
calendar.day_name[dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday()]
'Saturday'
calendar.day_abbr[dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday()]
'Sat'
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday() in (5, 6)
True
import datetime as dt
def is_weekend(order_date):
    return dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday() in (5, 6)
is_weekend('2014-01-25 00:00:00.0')
True
is_weekend('2014-01-22 00:00:00.0')
False
weekend_dates = set()
for order in orders:
    order_date = order.split(',')[1]
    if is_weekend(order_date):
        weekend_dates.add(order_date)
list(weekend_dates)[:10]
['2014-06-22 00:00:00.0',
 '2013-08-25 00:00:00.0',
 '2014-02-15 00:00:00.0',
 '2013-12-08 00:00:00.0',
 '2014-05-25 00:00:00.0',
 '2013-11-02 00:00:00.0',
 '2013-12-29 00:00:00.0',
 '2014-05-10 00:00:00.0',
 '2014-02-09 00:00:00.0',
 '2013-11-23 00:00:00.0']
len(weekend_dates)
103