Pandas cheat sheet.¶

  1. Working with data structures: Series and DataFrames.
  2. Creating DataFrames and loading data.
  3. Data exploration and filtering.
  4. Using the Query Language.
  5. Calculating derived metrics.
  6. Combining DataFrames.
  7. Cohort analysis and working with time series.
In [2]:
import pandas as pd
import numpy as np

A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.

In [2]:
# Let's create a simple Series
s = pd.Series(np.random.randint(5, size=(4)))
# filtering a Series
s[ s >= 3 ]
# to filter values with a function, use a lambda inside .loc
s.loc[lambda x: x >= 3]
Out[2]:
0    4
2    3
3    4
dtype: int32
In [10]:
# you can set up non-numeric indexes for Series
months = ['jan', 'feb', 'mar', 'apr']
s = pd.Series(np.random.randint(8, size=(4)), index=months)
s
Out[10]:
jan    6
feb    6
mar    5
apr    3
dtype: int32

A pandas DataFrame can be thought of as a two-dimensional table with labeled columns, where each column can have its own data type. Each column in the DataFrame is represented as a Series object, which is a one-dimensional array with an index labeling each element. An index can be either numeric or non-numeric (such as dates, strings, or categorical variables). Rows in a DataFrame are also indexed, usually with integers starting at 0. To access elements in a DataFrame, you can use the loc accessor, which selects rows and columns by their labels. Its syntax differs slightly from plain indexing in NumPy or Python: inside the square brackets you pass row and column labels, and boolean expressions are accepted as well (a short illustration follows the next cell).

In [11]:
months = ['jan', 'feb', 'mar', 'apr']
sales = {
    'revenue':     [100, 200, 300, 400],
    'items_sold':  [23, 43, 55, 65],
    'new_clients': [10, 20, 30, 40]
    }

# Let's create a simple DataFrame with a custom index column
data = pd.DataFrame(sales, index=months)
data
Out[11]:
revenue items_sold new_clients
jan 100 23 10
feb 200 43 20
mar 300 55 30
apr 400 65 40
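
Since the rows are now labeled, .loc can select them by label or by a boolean mask; a couple of illustrative lines (not part of the original notebook):

# select a single row by its label - returns a Series
data.loc['feb']

# select several rows and specific columns by label
data.loc[['jan', 'mar'], ['revenue', 'items_sold']]

# boolean expressions work inside loc as well
data.loc[data['revenue'] > 200]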
In [23]:
months = ['jan', 'feb', 'mar', 'apr']

# Sometimes, some records are missing data. For example, look at the goods_sold list
# it contains sales divided by product categories. In the first month, we sold cars,
# computers, and software. In the second, there are no cars, but bicycles appeared,
# and in the third, cars reappeared, but the bicycles disappeared.
goods_sold = [
    {'computers': 10, 'cars': 1, 'soft': 3, 'bicycles': 1},
    {'computers': 4, 'soft': 5, 'bicycles': 1},
    {'computers': 6, 'cars': 2, 'soft': 3},
    {'computers': 6, 'cars': 2, 'soft': 3}
    ]

data2 = pd.DataFrame(goods_sold, index=months)

# Note that bicycle sales in March and April are NaN, which stands for Not a Number.
data2.sample(4)
Out[23]:
computers cars soft bicycles
feb 4 NaN 5 1.0
jan 10 1.0 3 1.0
apr 6 2.0 3 NaN
mar 6 2.0 3 NaN
In [48]:
# You can drop every row that contains NaN values
data5 = data2.dropna()
data5
Out[48]:
computers cars soft bicycles
jan 10 1.0 3 1.0
In [50]:
# If you need to filter out NaN values selectively, there are two approaches:
# pd.notnull(data2['cars']) inside .loc, or .query("cars.notnull()")

# method 1
data2.loc[pd.notnull(data2['cars'])]

# method 2
data2.query('cars.notnull() and bicycles.notnull()')
Out[50]:
computers cars soft bicycles
jan 10 1.0 3 1.0
In [28]:
# Now let's see how to load data from files. Most often, data is stored in Excel spreadsheets or csv files.
data3 = pd.read_csv("world_data.csv")

# Explore the loaded data.
# The four attributes that any dataframe has are .shape, .columns, .index, and .dtypes.

# .shape shows how many rows and columns are in the dataframe.
# .columns shows the column names.
# .dtypes shows the data types.
# .index shows the row labels.
data3.index, data3.columns, data3.dtypes, data3.shape
Out[28]:
(RangeIndex(start=0, stop=625, step=1),
 Index(['Overall rank', 'Country', 'Region', 'Score', 'GDP per capita',
        'Social support', 'Healthy life expectancy',
        'Freedom to make life choices', 'Generosity',
        'Perceptions of corruption', 'year'],
       dtype='object'),
 Overall rank                      int64
 Country                          object
 Region                           object
 Score                           float64
 GDP per capita                  float64
 Social support                  float64
 Healthy life expectancy         float64
 Freedom to make life choices     object
 Generosity                      float64
 Perceptions of corruption       float64
 year                              int64
 dtype: object,
 (625, 11))
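
The comment above also mentions Excel spreadsheets; for completeness, a sketch of loading one (the file name here is just a placeholder, not a file from this notebook):

# hypothetical Excel file - pd.read_excel reads the first sheet by default
world_xlsx = pd.read_excel("world_data.xlsx", sheet_name=0)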
In [29]:
# To see the spread of values, the mean, and the median, use the .describe() method
data = data3
data.describe()
Out[29]:
Overall rank Score GDP per capita Social support Healthy life expectancy Generosity Perceptions of corruption year
count 625.00000 617.000000 617.000000 621.000000 616.000000 621.000000 617.000000 625.000000
mean 78.37280 5.539538 1.092853 1.258810 0.769447 0.373890 0.119883 2017.112000
std 45.14166 3.971712 4.009105 3.983228 4.012307 4.006105 0.101345 9.724586
min 1.00000 2.693000 0.000000 0.000000 0.000000 0.000000 0.000000 1776.000000
25% 39.00000 4.500000 0.631070 0.871140 0.440765 0.127060 0.053000 2016.000000
50% 78.00000 5.358000 0.989000 1.154000 0.640590 0.199970 0.088000 2017.000000
75% 117.00000 6.192000 1.260749 1.390000 0.808000 0.271000 0.149750 2018.000000
max 157.00000 100.000000 100.000000 100.000000 100.000000 100.000000 0.505210 2019.000000
In [ ]:
# Returns a DataFrame with a single column.
z1 = data.loc[[1, 2], ['Country']] 

# Returns a Series object, which is a one-dimensional labeled array.
# This is because 'Country' was passed as a plain string, not as a list.
z2 = data.loc[[1, 2], 'Country']
In [33]:
# Written like this, we get a Series object. If we add one more pair of square brackets, we get a DataFrame.
a = data['Score']

# If we need to display several columns, we need to insert a list with their names in square brackets.
# The outer square brackets index the dataframe, the inner ones form the list.
b = data[['Score','GDP per capita']] 
b.sample(5)
Out[33]:
Score GDP per capita
534 5.615 1.066880
589 4.356 0.522670
409 5.011 0.885416
277 4.424 0.314000
359 5.971 0.786441
In [35]:
d = {"price":[1, 2, 3, 5, 6], "count": [10, 20, 30, 40, 50], "percent": [24, 51, 71, 25, 42]}
a = pd.DataFrame(d)

# Now let's see how to add columns based on a condition.
# There are two ways to do it: using apply() or np.where()
a['discount'] = a['price'].apply(lambda x: 5 if x>2 else 3)
a['Good'] = np.where(a['price']>2, 'yes', 'no')
a
Out[35]:
price count percent discount Good
0 1 10 24 3 no
1 2 20 51 3 no
2 3 30 71 5 yes
3 5 40 25 5 yes
4 6 50 42 5 yes
In [151]:
# Filtering by condition. Each condition goes in its own parentheses, and they are combined with & (and) / | (or)
a.loc[((a['discount'] == 3) | (a['percent'] == 71))] 
Out[151]:
price count percent discount Good
0 1 10 24 3 no
1 2 20 51 3 no
2 3 30 71 5 yes
In [157]:
# Two ways to reach the same value: loc is label-based (raises KeyError on a missing label), iloc is position-based (raises IndexError)

# df.loc[row_indexer, column_indexer]
a.loc[1, ['percent']]

# iloc is based on using integers to access data
a.iloc[1, 2]
Out[157]:
51
In [ ]:
# you can store a condition in a variable and then select rows with it
big_count = a['count'] > 20
a.loc[big_count]

In Pandas, the NOT operator is represented by the ~ symbol. It can be used in boolean indexers to invert the condition.

a.loc[~(a['count'] > 20)]

The "not equal" operator in Pandas is represented by the != symbol. You can use this operator in boolean indexers to select rows where the value in the column is not equal to a certain value.

a.loc[a['column_name'] != 10]

Query Language¶

Another way to filter data is to use a query language. All conditions are written in a single string, e.g. 'sales > 1000 & ship_mode == "First"', and passed to the .query() method. The resulting query is more readable.

In [205]:
# you can use both and, or and &, |
# enclose string values in DOUBLE quotes and the whole expression in SINGLE quotes
a.query('count > 5 & Good == "no"')
a.query('price != 6 and (Good == "yes" or percent > 50)')
Out[205]:
price count percent discount Good
1 2 20 51 3 no
2 3 30 71 5 yes
3 5 40 25 5 yes
In [216]:
# values for filters can be stored in a variable and referenced in a query using the @ symbol
count_val = [30, 40]
no = "no"
a.query('count == @count_val & Good != @no')
Out[216]:
price count percent discount Good
2 3 30 71 5 yes
3 5 40 25 5 yes

Calculating derived metrics¶

number of orders, total revenue, average check, conversion

In [73]:
data = pd.read_csv('orders.csv')
data.head(3)
Out[73]:
id order_date ship_mode customer_id sales
0 100006 2014-09-07 Standard DK-13375 378.0
1 100090 2014-07-08 Standard EB-13705 699.2
2 100293 2014-03-14 Standard NF-18475 91.1
In [74]:
# change the 'order_date' column type from 'object' to 'datetime64'
data['order_date'] = pd.to_datetime(data['order_date'])
data.head(3)
Out[74]:
id order_date ship_mode customer_id sales
0 100006 2014-09-07 Standard DK-13375 378.0
1 100090 2014-07-08 Standard EB-13705 699.2
2 100293 2014-03-14 Standard NF-18475 91.1
In [75]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5009 entries, 0 to 5008
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           5009 non-null   int64         
 1   order_date   5009 non-null   datetime64[ns]
 2   ship_mode    5009 non-null   object        
 3   customer_id  5009 non-null   object        
 4   sales        5009 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 195.8+ KB
In [76]:
# calculate the total sales
sum_all = data['sales'].sum()

# now let's group the data by the ship_mode parameter using the .groupby() method and then calculate sales by ship_mode
sum_by_ship_mode = data.groupby('ship_mode')['sales'].sum()

sum_all, sum_by_ship_mode
Out[76]:
(2297200.8603000003,
 ship_mode
 First        351,428.4
 Same Day     128,363.1
 Second       459,193.6
 Standard   1,358,215.7
 Name: sales, dtype: float64)

When working with floats, rounding may be needed: 3.514284e+05 is scientific notation, meaning 3.51 * 10^5. We don't need that kind of precision, so we can tell Pandas to display values rounded to one decimal place:

pd.options.display.float_format = '{:,.1f}'.format

In [77]:
pd.options.display.float_format = '{:,.1f}'.format

sum_by_ship_mode
Out[77]:
ship_mode
First        351,428.4
Same Day     128,363.1
Second       459,193.6
Standard   1,358,215.7
Name: sales, dtype: float64
In [194]:
# add grouping by dates, pass columns for grouping as a list
sum_by_ship_mode_date = data.groupby(['ship_mode', 'order_date'])['sales'].sum()
sum_by_ship_mode_date.head()
Out[194]:
ship_mode  order_date
First      2014-01-06    12.8
           2014-01-11     9.9
           2014-01-14    62.0
           2014-01-15   149.9
           2014-01-19   378.6
Name: sales, dtype: float64
In [195]:
# To show all distinct values of a column in pandas, use the unique() function.
data['ship_mode'].unique()
Out[195]:
array(['Standard', 'Second', 'First', 'Same Day'], dtype=object)
In [81]:
# Add the order count and the mean order value to the sample.
# To do this, instead of .sum(), we use the .agg() method and pass it a list with the names of the required aggregations.
agg_by_ship_mode_date = data.groupby(['ship_mode', 'order_date'])['sales'].agg(['sum', 'count', 'mean'])
agg_by_ship_mode_date.head()
Out[81]:
sum count mean
ship_mode order_date
First 2014-01-06 12.8 1 12.8
2014-01-11 9.9 1 9.9
2014-01-14 62.0 1 62.0
2014-01-15 149.9 1 149.9
2014-01-19 378.6 1 378.6
In [85]:
# we see a large spread in the average check, so we sort the groups to find the most profitable days
agg_and_ord = (
    data.groupby(['ship_mode', 'order_date'])['sales']
    .agg(['sum', 'count', 'mean'])
    .sort_values(by='sum', ascending=False)
)

agg_and_ord.head(10)
Out[85]:
sum count mean
ship_mode order_date
Standard 2014-03-18 26,908.4 2 13,454.2
2016-10-02 18,398.2 2 9,199.1
First 2017-03-23 14,299.1 3 4,766.4
Standard 2014-09-08 14,060.4 9 1,562.3
First 2017-10-22 13,716.5 1 13,716.5
Standard 2016-12-17 12,185.1 5 2,437.0
2017-11-17 12,112.5 8 1,514.1
2015-09-17 11,467.6 5 2,293.5
2016-05-23 10,561.0 4 2,640.2
2014-09-23 10,478.6 5 2,095.7

Where are the clients from? To find out, we need to combine order data with customer data. Let's load it into the customers variable and see who they are.

In [105]:
customers = pd.read_csv('customers.csv')
customers.sample(2)
Out[105]:
id name segment state city
137 DJ-13510 Don Jones Corporate Tennessee Murfreesboro
159 NG-18430 Nathan Gelder Consumer Ohio Cincinnati
In [107]:
# add filtering by customer_id and get some customer data
id = 'BT-11530'
data.query('customer_id == @id')[['order_date', 'sales']]
Out[107]:
order_date sales
69 2014-05-21 31.8
560 2014-12-22 216.4
2445 2016-01-22 165.6
3131 2017-02-26 2,110.7
3271 2017-09-30 160.0
In [108]:
# calculate the total of this customer's orders
data.query('customer_id == @id')['sales'].sum()
Out[108]:
2684.4919999999997

Filtering the sum of orders by States and Customers¶

In [159]:
States = ['Tennessee', 'Ohio']

# Filter customers by state
filtered_customers = customers.query('state in @States')

# Filter orders by filtered_customer_id
filtered_customers_id = filtered_customers['id']
filtered_data = data.query('customer_id in @filtered_customers_id')

# We group data by customer_id and calculate the sum of sales.
# To create a dataframe from this with the customer_id and sum columns, we need to add .reset_index().
# Otherwise the resulting object will be a Series with "customer_id" as index and the Sum of sales as values.
grouped_data = filtered_data.groupby('customer_id')['sales'].sum().reset_index()

# Rename column
grouped_data = grouped_data.rename(columns={'sales': 'sum'})

# Sort by customer_id (ascending by default)
grouped_data.sort_values(by='customer_id').head()
Out[159]:
customer_id sum
0 AB-10255 914.5
1 AG-10390 200.9
2 BE-11410 2,261.4
3 BG-11695 1,874.2
4 BM-11785 1,338.8

Merge the customers and orders tables. DataFrames are combined using the .concat(), .merge() and .join() methods. They solve similar tasks but differ in syntax (a .join()/.concat() sketch follows the next cell).

In [173]:
# rename the id column in customers to customer_id to avoid duplication of columns
customers = customers.rename(columns={'id': 'customer_id'})
# join the tables by customer_id via inner join
new_df = pd.merge(data, customers, how='inner', on='customer_id')
new_df.head(3)
Out[173]:
id order_date ship_mode customer_id sales name segment state city
0 100006 2014-09-07 Standard DK-13375 378.0 Dennis Kane Consumer Ohio Marion
1 131884 2015-12-06 Same Day DK-13375 594.0 Dennis Kane Consumer Ohio Marion
2 145065 2015-12-12 First DK-13375 32.3 Dennis Kane Consumer Ohio Marion
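
As a point of comparison, the same inner join can be sketched with .join(), which matches on the index of the right frame, while .concat() is mainly for stacking frames. This is an illustrative alternative, not what the notebook actually ran:

# .join() uses the index of the right frame, so set customer_id as its index first
new_df_alt = data.join(customers.set_index('customer_id'), on='customer_id', how='inner')

# .concat() stacks frames with the same columns on top of each other
stacked = pd.concat([data.head(3), data.tail(3)])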
In [187]:
# Find 5 cities with the largest revenue in 2016.
# Since we have already converted the date column to datetime, we filter the data by year.
# The Series.dt.year accessor returns the year component of each date.
filtered_df = new_df[new_df['order_date'].dt.year == 2016]

# Now display the result: group by city, sum sales, sort in descending order,
# convert to a dataframe, reset the index and set the ordinal numbering, and then display the top 5 cities.
filtered_df.groupby('city')['sales'].sum().sort_values(ascending=False).reset_index().head()
Out[187]:
city sales
0 New York City 53,094.1
1 Philadelphia 39,895.5
2 Seattle 33,955.5
3 Los Angeles 33,611.1
4 San Francisco 27,990.0

How many orders shipped first class in the last 5 years?¶

In [206]:
# see which years are present in the table
data['order_date'].dt.year.unique()
# Output: array([2014, 2015, 2016, 2017], dtype=int64)

# We only have data for 4 years, so filtering will only occur by shipping class
Delivery_class = 'First'
data.query('ship_mode == @Delivery_class')['id'].count()
Out[206]:
787

How many clients are from California?¶

In [218]:
# Attach the state column to the orders table
df_with_state = pd.merge(data, customers[['customer_id', 'state']], how='left', on='customer_id')
df_with_state.head()

# Counting rows where state == "California" counts all customer_id values, including duplicates.
California_clients = df_with_state.query('state == "California"')['customer_id'].count()
# But the nunique() function returns the number of unique values in the specified column.
California_clients = df_with_state.query('state == "California"')['customer_id'].nunique()

# How many orders did they make?
orders_count = df_with_state.query('state == "California"')['id'].count()

# Print results
print(f'{California_clients} clients from California made {orders_count} orders.')
161 clients from California made 1006 orders.

Build a summary table of average checks, count and sum of orders for all states by year.¶

In [232]:
# We will use the .agg() method: first, it computes all the necessary metrics at once,
# and second, it returns a DataFrame, unlike .sum()
summary_table_by_year = df_with_state.groupby(['state', df_with_state['order_date'].dt.year])['sales'].agg(['sum', 'count', 'mean'])
summary_table_by_year.head(10)
Out[232]:
sum count mean
state order_date
Alabama 2014 3,829.7 9 425.5
2015 6,016.4 12 501.4
2016 13,669.6 16 854.3
2017 7,523.3 18 418.0
Arizona 2014 11,403.9 24 475.2
2015 21,801.6 31 703.3
2016 23,069.1 33 699.1
2017 25,711.6 48 535.7
Arkansas 2014 3,294.0 3 1,098.0
2015 111.7 2 55.8
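
The same summary can also be sketched with .pivot_table(), which builds such tables in one call; this is an alternative to the groupby above, shown for comparison:

# rows are states, columns are years, values are sales; several aggregations at once
tmp = df_with_state.assign(year=df_with_state['order_date'].dt.year)
pivot_by_year = tmp.pivot_table(index='state', columns='year', values='sales',
                                aggfunc=['sum', 'count', 'mean'])
pivot_by_year.head()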

Cohort data analysis¶

In [234]:
# Let's calculate the total revenue and the number of orders for each customer
data.groupby('customer_id')['sales'].agg(['sum', 'count']).head()
Out[234]:
sum count
customer_id
AA-10315 5,563.6 5
AA-10375 1,056.4 9
AA-10480 1,790.5 4
AA-10645 5,086.9 6
AB-10015 886.2 3

Calculate the date of the first purchase¶

In [244]:
# To calculate the date of each customer's first purchase, group the data by customer_id
# and find the min value of the order_date field. The result is saved in the first_orders variable.

# Both lines compute the first order date; the second one also renames the column to first_order and resets the index
first_orders = data.groupby('customer_id')['order_date'].agg(['min'])
first_orders = data.groupby('customer_id')['order_date'].min().rename('first_order').reset_index()

first_orders.head()
Out[244]:
customer_id first_order
0 AA-10315 2014-03-31
1 AA-10375 2014-04-21
2 AA-10480 2014-05-04
3 AA-10645 2014-06-22
4 AB-10015 2014-02-18

Building cohorts¶

A cohort here is the group of customers who made their first purchase on a given day. The tracked metrics are the number of orders and revenue.

We calculate the revenue and the number of orders for each cohort.

In [246]:
# First, add each customer's first purchase date to the orders table.
orders_merged = data.merge(first_orders, how='inner', on='customer_id')

orders_merged.head()
Out[246]:
id order_date ship_mode customer_id sales first_order
0 100006 2014-09-07 Standard DK-13375 378.0 2014-09-07
1 131884 2015-12-06 Same Day DK-13375 594.0 2014-09-07
2 145065 2015-12-12 First DK-13375 32.3 2014-09-07
3 133046 2017-07-27 Second DK-13375 298.0 2014-09-07
4 165099 2017-12-11 First DK-13375 1.4 2014-09-07
In [249]:
# Aggregate by the first_order date and by order_date and calculate the necessary indicators:
orders_cohorts = orders_merged.groupby(['first_order', 'order_date'])['sales'].agg(['sum','count'])
orders_cohorts.head(12)
Out[249]:
sum count
first_order order_date
2014-01-03 2014-01-03 16.4 1
2014-11-12 153.1 1
2015-04-18 209.6 1
2015-11-24 383.6 1
2016-05-15 7.8 1
2016-11-14 37.6 1
2017-03-04 89.6 1
2017-05-14 88.0 1
2017-08-26 65.0 1
2014-01-04 2014-01-04 288.1 1
2015-09-25 183.4 1
2015-11-20 344.4 1
The first order of the first cohort was on January 3 for $16. The customer came back almost a year later with a $153 purchase.¶

The cohorts are ready; now let's calculate the numbers for the first year of each cohort's life.¶

We know how much the store has earned from each cohort over time.

First, we find out how many days passed between the first purchase and each subsequent order, and drop the orders placed more than 365 days after the first one. To calculate the number of days between orders, subtract the first_order column from the order_date column.

In [254]:
# Create a filter with the condition (a boolean Series)
year_1_filter = orders_merged['order_date'] - orders_merged['first_order'] <= '365 days'
year_1_filter
Out[254]:
0        True
1       False
2       False
3       False
4       False
        ...  
5004     True
5005     True
5006    False
5007     True
5008    False
Length: 5009, dtype: bool
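An equivalent, slightly more explicit way to write the same comparison uses pd.Timedelta; the result is the same boolean Series:

# same filter, with the threshold spelled out as a Timedelta
year_1_filter = orders_merged['order_date'] - orders_merged['first_order'] <= pd.Timedelta(days=365)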
In [256]:
# Filter out unnecessary orders from the cohort table and store the result in the year_1_orders table.
# Just put the filter inside the square brackets instead of an inline condition
year_1_orders = orders_merged[year_1_filter]
year_1_orders.head()
Out[256]:
id order_date ship_mode customer_id sales first_order
0 100006 2014-09-07 Standard DK-13375 378.0 2014-09-07
8 100090 2014-07-08 Standard EB-13705 699.2 2014-07-08
9 129938 2014-12-15 Second EB-13705 445.8 2014-07-08
11 128125 2015-03-31 Standard EB-13705 120.8 2014-07-08
17 100293 2014-03-14 Standard NF-18475 91.1 2014-03-14
In [259]:
# Now let's group the orders by the date of the first purchase and order_date and calculate the 'sum' and 'count' metrics.
cohorts = year_1_orders.groupby(['first_order','order_date'])['sales'].agg(['sum', 'count'])
cohorts.head()
Out[259]:
sum count
first_order order_date
2014-01-03 2014-01-03 16.4 1
2014-11-12 153.1 1
2014-01-04 2014-01-04 288.1 1
2014-01-05 2014-01-05 19.5 1
2014-01-06 2014-01-06 4,407.1 3
In [266]:
# Last step: calculate how much revenue and how many orders customers bring in on average during their first year.
# First sum the metrics of each cohort, then average the values using the mean() method.
# Since we did not specify a particular column for the average,
# it calculated both the average revenue and the average number of orders.
cohorts.groupby(['first_order']).sum().mean()
Out[266]:
sum     1,949.9
count       4.0
dtype: float64
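
If only the average first-year revenue per cohort is needed (a single metric rather than both), the column can be selected explicitly; a small follow-up along the same lines:

# average first-year revenue per cohort only
cohorts.groupby('first_order')['sum'].sum().mean()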