{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Joining Data Frames\n", "\n", "Let us understand how to join Data Frames using Pandas." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [ "remove-cell" ] }, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%HTML\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%run 06_csv_to_pandas_data_frame.ipynb" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idorder_dateorder_customer_idorder_status
012013-07-25 00:00:00.011599CLOSED
122013-07-25 00:00:00.0256PENDING_PAYMENT
232013-07-25 00:00:00.012111COMPLETE
342013-07-25 00:00:00.08827CLOSED
452013-07-25 00:00:00.011318COMPLETE
...............
68878688792014-07-09 00:00:00.0778COMPLETE
68879688802014-07-13 00:00:00.01117COMPLETE
68880688812014-07-19 00:00:00.02518PENDING_PAYMENT
68881688822014-07-22 00:00:00.010000ON_HOLD
68882688832014-07-23 00:00:00.05533COMPLETE
\n", "

68883 rows × 4 columns

\n", "
" ], "text/plain": [ " order_id order_date order_customer_id order_status\n", "0 1 2013-07-25 00:00:00.0 11599 CLOSED\n", "1 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT\n", "2 3 2013-07-25 00:00:00.0 12111 COMPLETE\n", "3 4 2013-07-25 00:00:00.0 8827 CLOSED\n", "4 5 2013-07-25 00:00:00.0 11318 COMPLETE\n", "... ... ... ... ...\n", "68878 68879 2014-07-09 00:00:00.0 778 COMPLETE\n", "68879 68880 2014-07-13 00:00:00.0 1117 COMPLETE\n", "68880 68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT\n", "68881 68882 2014-07-22 00:00:00.0 10000 ON_HOLD\n", "68882 68883 2014-07-23 00:00:00.0 5533 COMPLETE\n", "\n", "[68883 rows x 4 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_item_idorder_item_order_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
0119571299.98299.98
12210731199.99199.99
2325025250.0050.00
3424031129.99129.99
454897249.9824.99
.....................
172193172194688814031129.99129.99
17219417219568882365159.9959.99
17219517219668882502150.0050.00
1721961721976888320811999.991999.99
172197172198688835023150.0050.00
\n", "

172198 rows × 6 columns

\n", "
" ], "text/plain": [ " order_item_id order_item_order_id order_item_product_id \\\n", "0 1 1 957 \n", "1 2 2 1073 \n", "2 3 2 502 \n", "3 4 2 403 \n", "4 5 4 897 \n", "... ... ... ... \n", "172193 172194 68881 403 \n", "172194 172195 68882 365 \n", "172195 172196 68882 502 \n", "172196 172197 68883 208 \n", "172197 172198 68883 502 \n", "\n", " order_item_quantity order_item_subtotal order_item_product_price \n", "0 1 299.98 299.98 \n", "1 1 199.99 199.99 \n", "2 5 250.00 50.00 \n", "3 1 129.99 129.99 \n", "4 2 49.98 24.99 \n", "... ... ... ... \n", "172193 1 129.99 129.99 \n", "172194 1 59.99 59.99 \n", "172195 1 50.00 50.00 \n", "172196 1 1999.99 1999.99 \n", "172197 3 150.00 50.00 \n", "\n", "[172198 rows x 6 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "order_items" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Join orders and order_items using orders.order_id and order_items.order_item_order_id." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[0;31mSignature:\u001b[0m \u001b[0morders\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'left'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlsuffix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrsuffix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mDocstring:\u001b[0m\n", "Join columns of another DataFrame.\n", "\n", "Join columns with `other` DataFrame either on index or on a key\n", "column. Efficiently join multiple DataFrame objects by index at once by\n", "passing a list.\n", "\n", "Parameters\n", "----------\n", "other : DataFrame, Series, or list of DataFrame\n", " Index should be similar to one of the columns in this one. If a\n", " Series is passed, its name attribute must be set, and that will be\n", " used as the column name in the resulting joined DataFrame.\n", "on : str, list of str, or array-like, optional\n", " Column or index level name(s) in the caller to join on the index\n", " in `other`, otherwise joins index-on-index. If multiple\n", " values given, the `other` DataFrame must have a MultiIndex. Can\n", " pass an array as the join key if it is not already contained in\n", " the calling DataFrame. Like an Excel VLOOKUP operation.\n", "how : {'left', 'right', 'outer', 'inner'}, default 'left'\n", " How to handle the operation of the two objects.\n", "\n", " * left: use calling frame's index (or column if on is specified)\n", " * right: use `other`'s index.\n", " * outer: form union of calling frame's index (or column if on is\n", " specified) with `other`'s index, and sort it.\n", " lexicographically.\n", " * inner: form intersection of calling frame's index (or column if\n", " on is specified) with `other`'s index, preserving the order\n", " of the calling's one.\n", "lsuffix : str, default ''\n", " Suffix to use from left frame's overlapping columns.\n", "rsuffix : str, default ''\n", " Suffix to use from right frame's overlapping columns.\n", "sort : bool, default False\n", " Order result DataFrame lexicographically by the join key. If False,\n", " the order of the join key depends on the join type (how keyword).\n", "\n", "Returns\n", "-------\n", "DataFrame\n", " A dataframe containing columns from both the caller and `other`.\n", "\n", "See Also\n", "--------\n", "DataFrame.merge : For column(s)-on-columns(s) operations.\n", "\n", "Notes\n", "-----\n", "Parameters `on`, `lsuffix`, and `rsuffix` are not supported when\n", "passing a list of `DataFrame` objects.\n", "\n", "Support for specifying index levels as the `on` parameter was added\n", "in version 0.23.0.\n", "\n", "Examples\n", "--------\n", ">>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],\n", "... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})\n", "\n", ">>> df\n", " key A\n", "0 K0 A0\n", "1 K1 A1\n", "2 K2 A2\n", "3 K3 A3\n", "4 K4 A4\n", "5 K5 A5\n", "\n", ">>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],\n", "... 'B': ['B0', 'B1', 'B2']})\n", "\n", ">>> other\n", " key B\n", "0 K0 B0\n", "1 K1 B1\n", "2 K2 B2\n", "\n", "Join DataFrames using their indexes.\n", "\n", ">>> df.join(other, lsuffix='_caller', rsuffix='_other')\n", " key_caller A key_other B\n", "0 K0 A0 K0 B0\n", "1 K1 A1 K1 B1\n", "2 K2 A2 K2 B2\n", "3 K3 A3 NaN NaN\n", "4 K4 A4 NaN NaN\n", "5 K5 A5 NaN NaN\n", "\n", "If we want to join using the key columns, we need to set key to be\n", "the index in both `df` and `other`. The joined DataFrame will have\n", "key as its index.\n", "\n", ">>> df.set_index('key').join(other.set_index('key'))\n", " A B\n", "key\n", "K0 A0 B0\n", "K1 A1 B1\n", "K2 A2 B2\n", "K3 A3 NaN\n", "K4 A4 NaN\n", "K5 A5 NaN\n", "\n", "Another option to join using the key columns is to use the `on`\n", "parameter. DataFrame.join always uses `other`'s index but we can use\n", "any column in `df`. This method preserves the original DataFrame's\n", "index in the result.\n", "\n", ">>> df.join(other.set_index('key'), on='key')\n", " key A B\n", "0 K0 A0 B0\n", "1 K1 A1 B1\n", "2 K2 A2 B2\n", "3 K3 A3 NaN\n", "4 K4 A4 NaN\n", "5 K5 A5 NaN\n", "\u001b[0;31mFile:\u001b[0m /opt/anaconda3/envs/beakerx/lib/python3.6/site-packages/pandas/core/frame.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "orders.join?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_status
order_id
12013-07-25 00:00:00.011599CLOSED
22013-07-25 00:00:00.0256PENDING_PAYMENT
32013-07-25 00:00:00.012111COMPLETE
42013-07-25 00:00:00.08827CLOSED
52013-07-25 00:00:00.011318COMPLETE
............
688792014-07-09 00:00:00.0778COMPLETE
688802014-07-13 00:00:00.01117COMPLETE
688812014-07-19 00:00:00.02518PENDING_PAYMENT
688822014-07-22 00:00:00.010000ON_HOLD
688832014-07-23 00:00:00.05533COMPLETE
\n", "

68883 rows × 3 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status\n", "order_id \n", "1 2013-07-25 00:00:00.0 11599 CLOSED\n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT\n", "3 2013-07-25 00:00:00.0 12111 COMPLETE\n", "4 2013-07-25 00:00:00.0 8827 CLOSED\n", "5 2013-07-25 00:00:00.0 11318 COMPLETE\n", "... ... ... ...\n", "68879 2014-07-09 00:00:00.0 778 COMPLETE\n", "68880 2014-07-13 00:00:00.0 1117 COMPLETE\n", "68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT\n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD\n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE\n", "\n", "[68883 rows x 3 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
order_item_order_id
119571299.98299.98
2210731199.99199.99
235025250.0050.00
244031129.99129.99
45897249.9824.99
..................
688811721944031129.99129.99
68882172195365159.9959.99
68882172196502150.0050.00
6888317219720811999.991999.99
688831721985023150.0050.00
\n", "

172198 rows × 5 columns

\n", "
" ], "text/plain": [ " order_item_id order_item_product_id \\\n", "order_item_order_id \n", "1 1 957 \n", "2 2 1073 \n", "2 3 502 \n", "2 4 403 \n", "4 5 897 \n", "... ... ... \n", "68881 172194 403 \n", "68882 172195 365 \n", "68882 172196 502 \n", "68883 172197 208 \n", "68883 172198 502 \n", "\n", " order_item_quantity order_item_subtotal \\\n", "order_item_order_id \n", "1 1 299.98 \n", "2 1 199.99 \n", "2 5 250.00 \n", "2 1 129.99 \n", "4 2 49.98 \n", "... ... ... \n", "68881 1 129.99 \n", "68882 1 59.99 \n", "68882 1 50.00 \n", "68883 1 1999.99 \n", "68883 3 150.00 \n", "\n", " order_item_product_price \n", "order_item_order_id \n", "1 299.98 \n", "2 199.99 \n", "2 50.00 \n", "2 129.99 \n", "4 24.99 \n", "... ... \n", "68881 129.99 \n", "68882 59.99 \n", "68882 50.00 \n", "68883 1999.99 \n", "68883 50.00 \n", "\n", "[172198 rows x 5 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "order_items.set_index('order_item_order_id')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
12013-07-25 00:00:00.011599CLOSED1.0957.01.0299.98299.98
22013-07-25 00:00:00.0256PENDING_PAYMENT2.01073.01.0199.99199.99
22013-07-25 00:00:00.0256PENDING_PAYMENT3.0502.05.0250.0050.00
22013-07-25 00:00:00.0256PENDING_PAYMENT4.0403.01.0129.99129.99
32013-07-25 00:00:00.012111COMPLETENaNNaNNaNNaNNaN
...........................
688812014-07-19 00:00:00.02518PENDING_PAYMENT172194.0403.01.0129.99129.99
688822014-07-22 00:00:00.010000ON_HOLD172195.0365.01.059.9959.99
688822014-07-22 00:00:00.010000ON_HOLD172196.0502.01.050.0050.00
688832014-07-23 00:00:00.05533COMPLETE172197.0208.01.01999.991999.99
688832014-07-23 00:00:00.05533COMPLETE172198.0502.03.0150.0050.00
\n", "

183650 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status \\\n", "1 2013-07-25 00:00:00.0 11599 CLOSED \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "3 2013-07-25 00:00:00.0 12111 COMPLETE \n", "... ... ... ... \n", "68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "\n", " order_item_id order_item_product_id order_item_quantity \\\n", "1 1.0 957.0 1.0 \n", "2 2.0 1073.0 1.0 \n", "2 3.0 502.0 5.0 \n", "2 4.0 403.0 1.0 \n", "3 NaN NaN NaN \n", "... ... ... ... \n", "68881 172194.0 403.0 1.0 \n", "68882 172195.0 365.0 1.0 \n", "68882 172196.0 502.0 1.0 \n", "68883 172197.0 208.0 1.0 \n", "68883 172198.0 502.0 3.0 \n", "\n", " order_item_subtotal order_item_product_price \n", "1 299.98 299.98 \n", "2 199.99 199.99 \n", "2 250.00 50.00 \n", "2 129.99 129.99 \n", "3 NaN NaN \n", "... ... ... \n", "68881 129.99 129.99 \n", "68882 59.99 59.99 \n", "68882 50.00 50.00 \n", "68883 1999.99 1999.99 \n", "68883 150.00 50.00 \n", "\n", "[183650 rows x 8 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Join orders and order_items using order_id (order_item_order_id from order_items)\n", "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
12013-07-25 00:00:00.011599CLOSED19571299.98299.98
22013-07-25 00:00:00.0256PENDING_PAYMENT210731199.99199.99
22013-07-25 00:00:00.0256PENDING_PAYMENT35025250.0050.00
22013-07-25 00:00:00.0256PENDING_PAYMENT44031129.99129.99
42013-07-25 00:00:00.08827CLOSED5897249.9824.99
...........................
688812014-07-19 00:00:00.02518PENDING_PAYMENT1721944031129.99129.99
688822014-07-22 00:00:00.010000ON_HOLD172195365159.9959.99
688822014-07-22 00:00:00.010000ON_HOLD172196502150.0050.00
688832014-07-23 00:00:00.05533COMPLETE17219720811999.991999.99
688832014-07-23 00:00:00.05533COMPLETE1721985023150.0050.00
\n", "

172198 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status \\\n", "1 2013-07-25 00:00:00.0 11599 CLOSED \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "4 2013-07-25 00:00:00.0 8827 CLOSED \n", "... ... ... ... \n", "68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "\n", " order_item_id order_item_product_id order_item_quantity \\\n", "1 1 957 1 \n", "2 2 1073 1 \n", "2 3 502 5 \n", "2 4 403 1 \n", "4 5 897 2 \n", "... ... ... ... \n", "68881 172194 403 1 \n", "68882 172195 365 1 \n", "68882 172196 502 1 \n", "68883 172197 208 1 \n", "68883 172198 502 3 \n", "\n", " order_item_subtotal order_item_product_price \n", "1 299.98 299.98 \n", "2 199.99 199.99 \n", "2 250.00 50.00 \n", "2 129.99 129.99 \n", "4 49.98 24.99 \n", "... ... ... \n", "68881 129.99 129.99 \n", "68882 59.99 59.99 \n", "68882 50.00 50.00 \n", "68883 1999.99 1999.99 \n", "68883 150.00 50.00 \n", "\n", "[172198 rows x 8 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexorder_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
012013-07-25 00:00:00.011599CLOSED19571299.98299.98
122013-07-25 00:00:00.0256PENDING_PAYMENT210731199.99199.99
222013-07-25 00:00:00.0256PENDING_PAYMENT35025250.0050.00
322013-07-25 00:00:00.0256PENDING_PAYMENT44031129.99129.99
442013-07-25 00:00:00.08827CLOSED5897249.9824.99
..............................
172193688812014-07-19 00:00:00.02518PENDING_PAYMENT1721944031129.99129.99
172194688822014-07-22 00:00:00.010000ON_HOLD172195365159.9959.99
172195688822014-07-22 00:00:00.010000ON_HOLD172196502150.0050.00
172196688832014-07-23 00:00:00.05533COMPLETE17219720811999.991999.99
172197688832014-07-23 00:00:00.05533COMPLETE1721985023150.0050.00
\n", "

172198 rows × 9 columns

\n", "
" ], "text/plain": [ " index order_date order_customer_id order_status \\\n", "0 1 2013-07-25 00:00:00.0 11599 CLOSED \n", "1 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "3 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "4 4 2013-07-25 00:00:00.0 8827 CLOSED \n", "... ... ... ... ... \n", "172193 68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT \n", "172194 68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "172195 68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "172196 68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "172197 68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "\n", " order_item_id order_item_product_id order_item_quantity \\\n", "0 1 957 1 \n", "1 2 1073 1 \n", "2 3 502 5 \n", "3 4 403 1 \n", "4 5 897 2 \n", "... ... ... ... \n", "172193 172194 403 1 \n", "172194 172195 365 1 \n", "172195 172196 502 1 \n", "172196 172197 208 1 \n", "172197 172198 502 3 \n", "\n", " order_item_subtotal order_item_product_price \n", "0 299.98 299.98 \n", "1 199.99 199.99 \n", "2 250.00 50.00 \n", "3 129.99 129.99 \n", "4 49.98 24.99 \n", "... ... ... \n", "172193 129.99 129.99 \n", "172194 59.99 59.99 \n", "172195 50.00 50.00 \n", "172196 1999.99 1999.99 \n", "172197 150.00 50.00 \n", "\n", "[172198 rows x 9 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " reset_index()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idorder_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
012013-07-25 00:00:00.011599CLOSED19571299.98299.98
122013-07-25 00:00:00.0256PENDING_PAYMENT210731199.99199.99
222013-07-25 00:00:00.0256PENDING_PAYMENT35025250.0050.00
322013-07-25 00:00:00.0256PENDING_PAYMENT44031129.99129.99
442013-07-25 00:00:00.08827CLOSED5897249.9824.99
..............................
172193688812014-07-19 00:00:00.02518PENDING_PAYMENT1721944031129.99129.99
172194688822014-07-22 00:00:00.010000ON_HOLD172195365159.9959.99
172195688822014-07-22 00:00:00.010000ON_HOLD172196502150.0050.00
172196688832014-07-23 00:00:00.05533COMPLETE17219720811999.991999.99
172197688832014-07-23 00:00:00.05533COMPLETE1721985023150.0050.00
\n", "

172198 rows × 9 columns

\n", "
" ], "text/plain": [ " order_id order_date order_customer_id order_status \\\n", "0 1 2013-07-25 00:00:00.0 11599 CLOSED \n", "1 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "3 2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "4 4 2013-07-25 00:00:00.0 8827 CLOSED \n", "... ... ... ... ... \n", "172193 68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT \n", "172194 68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "172195 68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "172196 68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "172197 68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "\n", " order_item_id order_item_product_id order_item_quantity \\\n", "0 1 957 1 \n", "1 2 1073 1 \n", "2 3 502 5 \n", "3 4 403 1 \n", "4 5 897 2 \n", "... ... ... ... \n", "172193 172194 403 1 \n", "172194 172195 365 1 \n", "172195 172196 502 1 \n", "172196 172197 208 1 \n", "172197 172198 502 3 \n", "\n", " order_item_subtotal order_item_product_price \n", "0 299.98 299.98 \n", "1 199.99 199.99 \n", "2 250.00 50.00 \n", "3 129.99 129.99 \n", "4 49.98 24.99 \n", "... ... ... \n", "172193 129.99 129.99 \n", "172194 59.99 59.99 \n", "172195 50.00 50.00 \n", "172196 1999.99 1999.99 \n", "172197 150.00 50.00 \n", "\n", "[172198 rows x 9 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " reset_index(). \\\n", " rename(columns={'index': 'order_id'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Task 1\n", "Compute Daily Revenue using orders.order_date and order_items.order_item_order_subtotal considering only COMPLETE and CLOSED orders.\n", "* Here are the steps to join orders and order_items and get daily revenue.\n", " * Create Data Frames for both orders and order_items using data in files.\n", " * Filter for orders which are either in **COMPLETE** or **CLOSED** status.\n", " * Set the join index for both the Data Frames.\n", " * Join both the Data Frames using `inner`.\n", " * Group the join results using **order_date** and get daily revenue by using `sum` on top of **order_item_subtotal**." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "orders_considered = orders.query(\"order_status in ('COMPLETE', 'CLOSED')\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "orders_filtered = orders[orders.order_status.isin([\"COMPLETE\", \"CLOSED\"])]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
revenue
order_date
2013-07-25 00:00:00.031547.23
2013-07-26 00:00:00.054713.23
2013-07-27 00:00:00.048411.48
2013-07-28 00:00:00.035672.03
2013-07-29 00:00:00.054579.70
......
2014-07-20 00:00:00.060047.45
2014-07-21 00:00:00.051427.70
2014-07-22 00:00:00.036717.24
2014-07-23 00:00:00.038795.23
2014-07-24 00:00:00.050885.19
\n", "

364 rows × 1 columns

\n", "
" ], "text/plain": [ " revenue\n", "order_date \n", "2013-07-25 00:00:00.0 31547.23\n", "2013-07-26 00:00:00.0 54713.23\n", "2013-07-27 00:00:00.0 48411.48\n", "2013-07-28 00:00:00.0 35672.03\n", "2013-07-29 00:00:00.0 54579.70\n", "... ...\n", "2014-07-20 00:00:00.0 60047.45\n", "2014-07-21 00:00:00.0 51427.70\n", "2014-07-22 00:00:00.0 36717.24\n", "2014-07-23 00:00:00.0 38795.23\n", "2014-07-24 00:00:00.0 50885.19\n", "\n", "[364 rows x 1 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " groupby('order_date')['order_item_subtotal']. \\\n", " agg(['sum']). \\\n", " rename(columns={'sum': 'revenue'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Task 2\n", "Get all the orders for which there are no corresponding order items.\n", "* We can use default join (`left`) to get orders with out corresponding order items." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
12013-07-25 00:00:00.011599CLOSED1.0957.01.0299.98299.98
22013-07-25 00:00:00.0256PENDING_PAYMENT2.01073.01.0199.99199.99
22013-07-25 00:00:00.0256PENDING_PAYMENT3.0502.05.0250.0050.00
22013-07-25 00:00:00.0256PENDING_PAYMENT4.0403.01.0129.99129.99
32013-07-25 00:00:00.012111COMPLETENaNNaNNaNNaNNaN
...........................
688812014-07-19 00:00:00.02518PENDING_PAYMENT172194.0403.01.0129.99129.99
688822014-07-22 00:00:00.010000ON_HOLD172195.0365.01.059.9959.99
688822014-07-22 00:00:00.010000ON_HOLD172196.0502.01.050.0050.00
688832014-07-23 00:00:00.05533COMPLETE172197.0208.01.01999.991999.99
688832014-07-23 00:00:00.05533COMPLETE172198.0502.03.0150.0050.00
\n", "

183650 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status \\\n", "1 2013-07-25 00:00:00.0 11599 CLOSED \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "2 2013-07-25 00:00:00.0 256 PENDING_PAYMENT \n", "3 2013-07-25 00:00:00.0 12111 COMPLETE \n", "... ... ... ... \n", "68881 2014-07-19 00:00:00.0 2518 PENDING_PAYMENT \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68882 2014-07-22 00:00:00.0 10000 ON_HOLD \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE \n", "\n", " order_item_id order_item_product_id order_item_quantity \\\n", "1 1.0 957.0 1.0 \n", "2 2.0 1073.0 1.0 \n", "2 3.0 502.0 5.0 \n", "2 4.0 403.0 1.0 \n", "3 NaN NaN NaN \n", "... ... ... ... \n", "68881 172194.0 403.0 1.0 \n", "68882 172195.0 365.0 1.0 \n", "68882 172196.0 502.0 1.0 \n", "68883 172197.0 208.0 1.0 \n", "68883 172198.0 502.0 3.0 \n", "\n", " order_item_subtotal order_item_product_price \n", "1 299.98 299.98 \n", "2 199.99 199.99 \n", "2 250.00 50.00 \n", "2 129.99 129.99 \n", "3 NaN NaN \n", "... ... ... \n", "68881 129.99 129.99 \n", "68882 59.99 59.99 \n", "68882 50.00 50.00 \n", "68883 1999.99 1999.99 \n", "68883 150.00 50.00 \n", "\n", "[183650 rows x 8 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
32013-07-25 00:00:00.012111COMPLETENaNNaNNaNNaNNaN
62013-07-25 00:00:00.07130COMPLETENaNNaNNaNNaNNaN
222013-07-25 00:00:00.0333COMPLETENaNNaNNaNNaNNaN
262013-07-25 00:00:00.07562COMPLETENaNNaNNaNNaNNaN
322013-07-25 00:00:00.03960COMPLETENaNNaNNaNNaNNaN
...........................
688672014-06-23 00:00:00.0869CANCELEDNaNNaNNaNNaNNaN
688722014-06-29 00:00:00.03354COMPLETENaNNaNNaNNaNNaN
688742014-07-03 00:00:00.01601COMPLETENaNNaNNaNNaNNaN
688762014-07-06 00:00:00.04124COMPLETENaNNaNNaNNaNNaN
688772014-07-07 00:00:00.09692ON_HOLDNaNNaNNaNNaNNaN
\n", "

11452 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status order_item_id \\\n", "3 2013-07-25 00:00:00.0 12111 COMPLETE NaN \n", "6 2013-07-25 00:00:00.0 7130 COMPLETE NaN \n", "22 2013-07-25 00:00:00.0 333 COMPLETE NaN \n", "26 2013-07-25 00:00:00.0 7562 COMPLETE NaN \n", "32 2013-07-25 00:00:00.0 3960 COMPLETE NaN \n", "... ... ... ... ... \n", "68867 2014-06-23 00:00:00.0 869 CANCELED NaN \n", "68872 2014-06-29 00:00:00.0 3354 COMPLETE NaN \n", "68874 2014-07-03 00:00:00.0 1601 COMPLETE NaN \n", "68876 2014-07-06 00:00:00.0 4124 COMPLETE NaN \n", "68877 2014-07-07 00:00:00.0 9692 ON_HOLD NaN \n", "\n", " order_item_product_id order_item_quantity order_item_subtotal \\\n", "3 NaN NaN NaN \n", "6 NaN NaN NaN \n", "22 NaN NaN NaN \n", "26 NaN NaN NaN \n", "32 NaN NaN NaN \n", "... ... ... ... \n", "68867 NaN NaN NaN \n", "68872 NaN NaN NaN \n", "68874 NaN NaN NaN \n", "68876 NaN NaN NaN \n", "68877 NaN NaN NaN \n", "\n", " order_item_product_price \n", "3 NaN \n", "6 NaN \n", "22 NaN \n", "26 NaN \n", "32 NaN \n", "... ... \n", "68867 NaN \n", "68872 NaN \n", "68874 NaN \n", "68876 NaN \n", "68877 NaN \n", "\n", "[11452 rows x 8 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id')). \\\n", " query('order_item_id.isna()')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "orders_joined = orders.set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
32013-07-25 00:00:00.012111COMPLETENaNNaNNaNNaNNaN
62013-07-25 00:00:00.07130COMPLETENaNNaNNaNNaNNaN
222013-07-25 00:00:00.0333COMPLETENaNNaNNaNNaNNaN
262013-07-25 00:00:00.07562COMPLETENaNNaNNaNNaNNaN
322013-07-25 00:00:00.03960COMPLETENaNNaNNaNNaNNaN
...........................
688672014-06-23 00:00:00.0869CANCELEDNaNNaNNaNNaNNaN
688722014-06-29 00:00:00.03354COMPLETENaNNaNNaNNaNNaN
688742014-07-03 00:00:00.01601COMPLETENaNNaNNaNNaNNaN
688762014-07-06 00:00:00.04124COMPLETENaNNaNNaNNaNNaN
688772014-07-07 00:00:00.09692ON_HOLDNaNNaNNaNNaNNaN
\n", "

11452 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status order_item_id \\\n", "3 2013-07-25 00:00:00.0 12111 COMPLETE NaN \n", "6 2013-07-25 00:00:00.0 7130 COMPLETE NaN \n", "22 2013-07-25 00:00:00.0 333 COMPLETE NaN \n", "26 2013-07-25 00:00:00.0 7562 COMPLETE NaN \n", "32 2013-07-25 00:00:00.0 3960 COMPLETE NaN \n", "... ... ... ... ... \n", "68867 2014-06-23 00:00:00.0 869 CANCELED NaN \n", "68872 2014-06-29 00:00:00.0 3354 COMPLETE NaN \n", "68874 2014-07-03 00:00:00.0 1601 COMPLETE NaN \n", "68876 2014-07-06 00:00:00.0 4124 COMPLETE NaN \n", "68877 2014-07-07 00:00:00.0 9692 ON_HOLD NaN \n", "\n", " order_item_product_id order_item_quantity order_item_subtotal \\\n", "3 NaN NaN NaN \n", "6 NaN NaN NaN \n", "22 NaN NaN NaN \n", "26 NaN NaN NaN \n", "32 NaN NaN NaN \n", "... ... ... ... \n", "68867 NaN NaN NaN \n", "68872 NaN NaN NaN \n", "68874 NaN NaN NaN \n", "68876 NaN NaN NaN \n", "68877 NaN NaN NaN \n", "\n", " order_item_product_price \n", "3 NaN \n", "6 NaN \n", "22 NaN \n", "26 NaN \n", "32 NaN \n", "... ... \n", "68867 NaN \n", "68872 NaN \n", "68874 NaN \n", "68876 NaN \n", "68877 NaN \n", "\n", "[11452 rows x 8 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_joined[orders_joined['order_item_id'].isna()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Task 3\n", "Compute Daily Product Revenue using orders.order_date as well as order_items.order_item_product_id and order_items.order_item_order_subtotal considering only COMPLETE and CLOSED orders." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "orders_considered = orders.query(\"order_status in ('COMPLETE', 'CLOSED')\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "orders_filtered = orders[orders.order_status.isin([\"COMPLETE\", \"CLOSED\"])]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_customer_idorder_statusorder_item_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
12013-07-25 00:00:00.011599CLOSED19571299.98299.98
42013-07-25 00:00:00.08827CLOSED5897249.9824.99
42013-07-25 00:00:00.08827CLOSED63655299.9559.99
42013-07-25 00:00:00.08827CLOSED75023150.0050.00
42013-07-25 00:00:00.08827CLOSED810144199.9249.98
...........................
688802014-07-13 00:00:00.01117COMPLETE17219110731199.99199.99
688802014-07-13 00:00:00.01117COMPLETE17219210145249.9049.98
688802014-07-13 00:00:00.01117COMPLETE17219310143149.9449.98
688832014-07-23 00:00:00.05533COMPLETE17219720811999.991999.99
688832014-07-23 00:00:00.05533COMPLETE1721985023150.0050.00
\n", "

75408 rows × 8 columns

\n", "
" ], "text/plain": [ " order_date order_customer_id order_status order_item_id \\\n", "1 2013-07-25 00:00:00.0 11599 CLOSED 1 \n", "4 2013-07-25 00:00:00.0 8827 CLOSED 5 \n", "4 2013-07-25 00:00:00.0 8827 CLOSED 6 \n", "4 2013-07-25 00:00:00.0 8827 CLOSED 7 \n", "4 2013-07-25 00:00:00.0 8827 CLOSED 8 \n", "... ... ... ... ... \n", "68880 2014-07-13 00:00:00.0 1117 COMPLETE 172191 \n", "68880 2014-07-13 00:00:00.0 1117 COMPLETE 172192 \n", "68880 2014-07-13 00:00:00.0 1117 COMPLETE 172193 \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE 172197 \n", "68883 2014-07-23 00:00:00.0 5533 COMPLETE 172198 \n", "\n", " order_item_product_id order_item_quantity order_item_subtotal \\\n", "1 957 1 299.98 \n", "4 897 2 49.98 \n", "4 365 5 299.95 \n", "4 502 3 150.00 \n", "4 1014 4 199.92 \n", "... ... ... ... \n", "68880 1073 1 199.99 \n", "68880 1014 5 249.90 \n", "68880 1014 3 149.94 \n", "68883 208 1 1999.99 \n", "68883 502 3 150.00 \n", "\n", " order_item_product_price \n", "1 299.98 \n", "4 24.99 \n", "4 59.99 \n", "4 50.00 \n", "4 49.98 \n", "... ... \n", "68880 199.99 \n", "68880 49.98 \n", "68880 49.98 \n", "68883 1999.99 \n", "68883 50.00 \n", "\n", "[75408 rows x 8 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " groupby(['order_date', 'order_item_product_id'])['order_item_subtotal']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(('2013-07-25 00:00:00.0', 24),\n", " 57762 319.96\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 93),\n", " 17 74.97\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 134),\n", " 12 100.0\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 191),\n", " 12 499.95\n", " 28 399.96\n", " 28 99.99\n", " 61 399.96\n", " 71 499.95\n", " 101 99.99\n", " 57757 499.95\n", " 57764 499.95\n", " 57768 499.95\n", " 57776 99.99\n", " 57776 99.99\n", " 57779 499.95\n", " 57782 199.98\n", " 57788 199.98\n", " 57788 499.95\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 226),\n", " 68691 599.99\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 365),\n", " 4 299.95\n", " 5 299.95\n", " 15 179.97\n", " 17 239.96\n", " 18 119.98\n", " 28 59.99\n", " 37 59.99\n", " 45 59.99\n", " 57 179.97\n", " 57 119.98\n", " 61 119.98\n", " 61 119.98\n", " 71 119.98\n", " 91 299.95\n", " 57756 59.99\n", " 57757 119.98\n", " 57779 299.95\n", " 57781 299.95\n", " 57788 239.96\n", " 67416 59.99\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 403),\n", " 5 129.99\n", " 18 129.99\n", " 24 129.99\n", " 35 129.99\n", " 57 129.99\n", " 88 129.99\n", " 98 129.99\n", " 57754 129.99\n", " 57756 129.99\n", " 57757 129.99\n", " 57762 129.99\n", " 57762 129.99\n", " 57768 129.99\n", " 57788 129.99\n", " 68691 129.99\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 502),\n", " 4 150.0\n", " 12 250.0\n", " 15 50.0\n", " 24 50.0\n", " 24 250.0\n", " 51 50.0\n", " 62 50.0\n", " 67 150.0\n", " 98 100.0\n", " 57757 150.0\n", " 57758 50.0\n", " 57758 100.0\n", " 57764 150.0\n", " 67416 100.0\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 572),\n", " 72 119.97\n", " Name: order_item_subtotal, dtype: float64),\n", " (('2013-07-25 00:00:00.0', 625),\n", " 57764 199.99\n", " Name: order_item_subtotal, dtype: float64)]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " groupby(['order_date', 'order_item_product_id'])['order_item_subtotal'])[:10]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
revenue
order_dateorder_item_product_id
2013-07-25 00:00:00.024319.96
9374.97
134100.00
1915099.49
226599.99
.........
2014-07-24 00:00:00.092631.98
9575399.64
100410399.48
10143148.74
10734199.79
\n", "

9120 rows × 1 columns

\n", "
" ], "text/plain": [ " revenue\n", "order_date order_item_product_id \n", "2013-07-25 00:00:00.0 24 319.96\n", " 93 74.97\n", " 134 100.00\n", " 191 5099.49\n", " 226 599.99\n", "... ...\n", "2014-07-24 00:00:00.0 926 31.98\n", " 957 5399.64\n", " 1004 10399.48\n", " 1014 3148.74\n", " 1073 4199.79\n", "\n", "[9120 rows x 1 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " groupby(['order_date', 'order_item_product_id'])['order_item_subtotal']. \\\n", " agg(['sum']). \\\n", " rename(columns={'sum': 'revenue'})" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_dateorder_item_product_idrevenue
02013-07-25 00:00:00.024319.96
12013-07-25 00:00:00.09374.97
22013-07-25 00:00:00.0134100.00
32013-07-25 00:00:00.01915099.49
42013-07-25 00:00:00.0226599.99
............
91152014-07-24 00:00:00.092631.98
91162014-07-24 00:00:00.09575399.64
91172014-07-24 00:00:00.0100410399.48
91182014-07-24 00:00:00.010143148.74
91192014-07-24 00:00:00.010734199.79
\n", "

9120 rows × 3 columns

\n", "
" ], "text/plain": [ " order_date order_item_product_id revenue\n", "0 2013-07-25 00:00:00.0 24 319.96\n", "1 2013-07-25 00:00:00.0 93 74.97\n", "2 2013-07-25 00:00:00.0 134 100.00\n", "3 2013-07-25 00:00:00.0 191 5099.49\n", "4 2013-07-25 00:00:00.0 226 599.99\n", "... ... ... ...\n", "9115 2014-07-24 00:00:00.0 926 31.98\n", "9116 2014-07-24 00:00:00.0 957 5399.64\n", "9117 2014-07-24 00:00:00.0 1004 10399.48\n", "9118 2014-07-24 00:00:00.0 1014 3148.74\n", "9119 2014-07-24 00:00:00.0 1073 4199.79\n", "\n", "[9120 rows x 3 columns]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders_considered. \\\n", " set_index('order_id'). \\\n", " join(order_items.set_index('order_item_order_id'), how='inner'). \\\n", " groupby(['order_date', 'order_item_product_id'])['order_item_subtotal']. \\\n", " agg(['sum']). \\\n", " rename(columns={'sum': 'revenue'}). \\\n", " reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.12" } }, "nbformat": 4, "nbformat_minor": 4 }