import pandas as pd
import numpy as np
A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.
# Let's create a simple Series
s = pd.Series(np.random.randint(5, size=(4)))
# filtering a Series by value
s[ s >= 3 ]
# to filter values with a function, pass a lambda to .loc
s.loc[lambda x: x>= 3]
0    4
2    3
3    4
dtype: int32
# you can set up non-numeric indexes for Series
months = ['jan', 'feb', 'mar', 'apr']
s = pd.Series(np.random.randint(8, size=(4)), index=months)
s
jan    6
feb    6
mar    5
apr    3
dtype: int32
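With a labeled index, elements can be selected by label as well as by position. A small sketch using the Series above (the exact values will differ because the data is random):
# select a single element by its label
s['feb']
# label-based slicing with .loc includes both endpoints
s.loc['jan':'mar']
# positional access still works via .iloc
s.iloc[0]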
A pandas DataFrame can be thought of as a table with labeled columns, where each column can hold its own data type. Each column in the DataFrame is represented as a Series object, which is a one-dimensional array with an index labeling each element. An index can be either numeric or non-numeric (such as dates, strings, or categorical values). Rows in a DataFrame are also indexed, usually with integers starting at 0. To access elements in a DataFrame, you can use the .loc accessor, which selects rows and columns by their labels. Its syntax differs slightly from normal indexing in numpy or Python: it uses square brackets with a row indexer and a column indexer, and it also accepts boolean expressions, as shown in the example after the table below.
months = ['jan', 'feb', 'mar', 'apr']
sales = {
'revenue': [100, 200, 300, 400],
'items_sold': [23, 43, 55, 65],
'new_clients': [10, 20, 30, 40]
}
# Let's create a simple DataFrame with a custom index column
data = pd.DataFrame(sales, index=months)
data
| | revenue | items_sold | new_clients |
---|---|---|---|
jan | 100 | 23 | 10 |
feb | 200 | 43 | 20 |
mar | 300 | 55 | 30 |
apr | 400 | 65 | 40 |
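A quick sketch of .loc on the DataFrame above, selecting rows by label and by a boolean expression:
# a single row by its label
data.loc['feb']
# rows and columns by label (label slices include both endpoints)
data.loc['jan':'mar', ['revenue', 'items_sold']]
# a boolean expression inside .loc filters rows
data.loc[data['revenue'] > 200]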
months = ['jan', 'feb', 'mar', 'apr']
# Sometimes, some records are missing data. For example, look at the goods_sold list
# it contains sales divided by product categories. In the first month, we sold cars,
# computers, and software. In the second, there are no cars, but bicycles appeared,
# and in the third, cars reappeared, but the bicycles disappeared.
goods_sold = [
{'computers': 10, 'cars': 1, 'soft': 3, 'bicycles': 1},
{'computers': 4, 'soft': 5, 'bicycles': 1},
{'computers': 6, 'cars': 2, 'soft': 3},
{'computers': 6, 'cars': 2, 'soft': 3}
]
data2 = pd.DataFrame(goods_sold, index=months)
# Note that bicycle sales in March and April are NaN, which stands for Not a Number.
data2.sample(4)
| | computers | cars | soft | bicycles |
---|---|---|---|---|
feb | 4 | NaN | 5 | 1.0 |
jan | 10 | 1.0 | 3 | 1.0 |
apr | 6 | 2.0 | 3 | NaN |
mar | 6 | 2.0 | 3 | NaN |
# You can drop every row that contains NaN values
data5 = data2.dropna()
data5
| | computers | cars | soft | bicycles |
---|---|---|---|---|
jan | 10 | 1.0 | 3 | 1.0 |
# To filter out NaN values selectively, there are two approaches: pd.notnull(data2['cars']) and .query('cars.notnull()')
# approach 1
data2.loc[pd.notnull(data2['cars'])]
# approach 2 (calling .notnull() inside .query() may require engine='python' on some pandas versions)
data2.query('cars.notnull() and bicycles.notnull()')
| | computers | cars | soft | bicycles |
---|---|---|---|---|
jan | 10 | 1.0 | 3 | 1.0 |
# Now let's see how to load data from files. Most often, data is stored in Excel spreadsheets or csv files.
data3 = pd.read_csv("world_data.csv")
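# Excel spreadsheets are read in a similar way with pd.read_excel; the file name below is
# hypothetical and only illustrates the call (reading .xlsx files also requires the openpyxl package)
# data_xlsx = pd.read_excel("world_data.xlsx", sheet_name=0)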
# Explore the loaded data.
# Four attributes that any dataframe has are .shape, .columns, .index, and .dtypes.
# .shape shows how many rows and columns are in the dataframe.
# .columns shows the column names.
# .dtypes shows the data type of each column.
# .index shows the row labels.
data3.index, data3.columns, data3.dtypes, data3.shape
(RangeIndex(start=0, stop=625, step=1),
 Index(['Overall rank', 'Country', 'Region', 'Score', 'GDP per capita',
        'Social support', 'Healthy life expectancy',
        'Freedom to make life choices', 'Generosity',
        'Perceptions of corruption', 'year'],
       dtype='object'),
 Overall rank                      int64
 Country                          object
 Region                           object
 Score                           float64
 GDP per capita                  float64
 Social support                  float64
 Healthy life expectancy         float64
 Freedom to make life choices     object
 Generosity                      float64
 Perceptions of corruption       float64
 year                              int64
 dtype: object,
 (625, 11))
# To see the spread of values, the mean, and the median, use the .describe() method
data = data3
data.describe()
| | Overall rank | Score | GDP per capita | Social support | Healthy life expectancy | Generosity | Perceptions of corruption | year |
---|---|---|---|---|---|---|---|---|
count | 625.00000 | 617.000000 | 617.000000 | 621.000000 | 616.000000 | 621.000000 | 617.000000 | 625.000000 |
mean | 78.37280 | 5.539538 | 1.092853 | 1.258810 | 0.769447 | 0.373890 | 0.119883 | 2017.112000 |
std | 45.14166 | 3.971712 | 4.009105 | 3.983228 | 4.012307 | 4.006105 | 0.101345 | 9.724586 |
min | 1.00000 | 2.693000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1776.000000 |
25% | 39.00000 | 4.500000 | 0.631070 | 0.871140 | 0.440765 | 0.127060 | 0.053000 | 2016.000000 |
50% | 78.00000 | 5.358000 | 0.989000 | 1.154000 | 0.640590 | 0.199970 | 0.088000 | 2017.000000 |
75% | 117.00000 | 6.192000 | 1.260749 | 1.390000 | 0.808000 | 0.271000 | 0.149750 | 2018.000000 |
max | 157.00000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 0.505210 | 2019.000000 |
# Returns a DataFrame with a single column.
z1 = data.loc[[1, 2], ['Country']]
# Returns a Series object, which is a one-dimensional labeled array.
# This is because 'Country' was passed to .loc as a plain string, not as a list.
z2 = data.loc[[1, 2], 'Country']
# Indexing with a single column name returns a Series. Adding one more pair of square brackets returns a DataFrame.
a = data['Score']
# To display several columns, pass a list with their names inside the square brackets.
# The outer square brackets index the dataframe; the inner ones form the list.
b = data[['Score','GDP per capita']]
b.sample(5)
| | Score | GDP per capita |
---|---|---|
534 | 5.615 | 1.066880 |
589 | 4.356 | 0.522670 |
409 | 5.011 | 0.885416 |
277 | 4.424 | 0.314000 |
359 | 5.971 | 0.786441 |
d = {"price":[1, 2, 3, 5, 6], "count": [10, 20, 30, 40, 50], "percent": [24, 51, 71, 25, 42]}
a = pd.DataFrame(d)
# Now let's see how to add columns based on conditions.
# There are two ways to do this: with .apply() or with np.where()
a['discount'] = a['price'].apply(lambda x: 5 if x>2 else 3)
a['Good'] = np.where(a['price']>2, 'yes', 'no')
a
| | price | count | percent | discount | Good |
---|---|---|---|---|---|
0 | 1 | 10 | 24 | 3 | no |
1 | 2 | 20 | 51 | 3 | no |
2 | 3 | 30 | 71 | 5 | yes |
3 | 5 | 40 | 25 | 5 | yes |
4 | 6 | 50 | 42 | 5 | yes |
# Filtering by condition. Each condition goes in its own parentheses, and the conditions are combined with & (and) / | (or) inside the outer brackets
a.loc[((a['discount'] == 3) | (a['percent'] == 71))]
| | price | count | percent | discount | Good |
---|---|---|---|---|---|
0 | 1 | 10 | 24 | 3 | no |
1 | 2 | 20 | 51 | 3 | no |
2 | 3 | 30 | 71 | 5 | yes |
# Two ways to select the 'percent' value of row 1: .loc selects by label (a missing label raises KeyError),
# while .iloc selects by integer position (a missing position raises IndexError)
# df.loc[row_indexer, column_indexer]
a.loc[1, ['percent']]  # returns a Series, because the column is passed as a list
# iloc uses integer positions to access data
a.iloc[1, 2]           # returns a scalar
51
# a condition can be saved in a variable and then used to select rows
big_count = a['count'] > 20
a.loc[big_count]
In pandas, the NOT operator is represented by the ~ symbol. It can be used in boolean indexing to invert a condition.
a.loc[~(a['count'] > 20)]
The "not equal" operator in Pandas is represented by the != symbol. You can use this operator in boolean indexers to select rows where the value in the column is not equal to a certain value.
a.loc[a['column_name'] != 10]
Another way to filter data is to use a query string. All conditions are written in one string, for example 'sales > 1000 & ship_mode == "First"', and passed to the .query() method. Such a query is often more readable.
# both and/or and &/| can be used
# enclose string values in DOUBLE quotes and the whole expression in SINGLE quotes
a.query('count > 5 & Good == "no"')
a.query('price != 6 and (Good == "yes" or percent > 50)')
| | price | count | percent | discount | Good |
---|---|---|---|---|---|
1 | 2 | 20 | 51 | 3 | no |
2 | 3 | 30 | 71 | 5 | yes |
3 | 5 | 40 | 25 | 5 | yes |
# values for filters can be stored in a variable and referenced in a query using the @ symbol
count_val = [30, 40]
no = "no"
a.query('count == @count_val & Good != @no')
| | price | count | percent | discount | Good |
---|---|---|---|---|---|
2 | 3 | 30 | 71 | 5 | yes |
3 | 5 | 40 | 25 | 5 | yes |
Now let's move on to real order data and calculate basic metrics: the number of orders, total revenue, the average check, and conversion (the number of orders and the average check are sketched further below, after the grouped sums).
data = pd.read_csv('orders.csv')
data.head(3)
| | id | order_date | ship_mode | customer_id | sales |
---|---|---|---|---|---|
0 | 100006 | 2014-09-07 | Standard | DK-13375 | 378.0 |
1 | 100090 | 2014-07-08 | Standard | EB-13705 | 699.2 |
2 | 100293 | 2014-03-14 | Standard | NF-18475 | 91.1 |
# convert the 'order_date' column from 'object' to 'datetime64'
data['order_date'] = pd.to_datetime(data['order_date'])
data.head(3)
| | id | order_date | ship_mode | customer_id | sales |
---|---|---|---|---|---|
0 | 100006 | 2014-09-07 | Standard | DK-13375 | 378.0 |
1 | 100090 | 2014-07-08 | Standard | EB-13705 | 699.2 |
2 | 100293 | 2014-03-14 | Standard | NF-18475 | 91.1 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5009 entries, 0 to 5008
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   id           5009 non-null   int64
 1   order_date   5009 non-null   datetime64[ns]
 2   ship_mode    5009 non-null   object
 3   customer_id  5009 non-null   object
 4   sales        5009 non-null   float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 195.8+ KB
# calculate the total sales
sum_all = data['sales'].sum()
# now let's group the data by the ship_mode parameter using the .groupby() method and then calculate sales by ship_mode
sum_by_ship_mode = data.groupby('ship_mode')['sales'].sum()
sum_all, sum_by_ship_mode
(2297200.8603000003,
 ship_mode
 First        351,428.4
 Same Day     128,363.1
 Second       459,193.6
 Standard   1,358,215.7
 Name: sales, dtype: float64)
If we are working with floats, the display may need rounding: 3.514284e+05 is scientific notation and means 3.514284 * 10^5. We don't need that much precision, so we can tell pandas to display values with one decimal place:
pd.options.display.float_format = '{:,.1f}'.format
sum_by_ship_mode
ship_mode
First        351,428.4
Same Day     128,363.1
Second       459,193.6
Standard   1,358,215.7
Name: sales, dtype: float64
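The introduction above also mentioned the number of orders and the average check. A minimal sketch of how they could be computed from the same orders table (conversion would additionally require traffic data, which this dataset does not contain):
orders_count = data['id'].count()  # number of orders
avg_check = data['sales'].mean()   # average check = revenue / number of orders
orders_count, avg_check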
# add grouping by dates, pass columns for grouping as a list
sum_by_ship_mode_date = data.groupby(['ship_mode', 'order_date'])['sales'].sum()
sum_by_ship_mode_date.head()
ship_mode  order_date
First      2014-01-06     12.8
           2014-01-11      9.9
           2014-01-14     62.0
           2014-01-15    149.9
           2014-01-19    378.6
Name: sales, dtype: float64
# To show all distinct values of a column in pandas, use the unique() function.
data['ship_mode'].unique()
array(['Standard', 'Second', 'First', 'Same Day'], dtype=object)
# Add to the sample the number and the mean value of orders.
# To do this, instead of .sum(), we use the .agg() method. We pass a list with the names of the necessary methods into .agg().
agg_by_ship_mode_date = data.groupby(['ship_mode', 'order_date'])['sales'].agg(['sum', 'count', 'mean'])
agg_by_ship_mode_date.head()
| | | sum | count | mean |
---|---|---|---|---|
| ship_mode | order_date | | | |
| First | 2014-01-06 | 12.8 | 1 | 12.8 |
| | 2014-01-11 | 9.9 | 1 | 9.9 |
| | 2014-01-14 | 62.0 | 1 | 62.0 |
| | 2014-01-15 | 149.9 | 1 | 149.9 |
| | 2014-01-19 | 378.6 | 1 | 378.6 |
# the average check varies a lot, so let's sort the groups by the most profitable days
agg_and_ord = (
data.groupby(['ship_mode', 'order_date'])['sales']
.agg(['sum', 'count', 'mean'])
.sort_values(by='sum', ascending=False)
)
agg_and_ord.head(10)
| | | sum | count | mean |
---|---|---|---|---|
| ship_mode | order_date | | | |
| Standard | 2014-03-18 | 26,908.4 | 2 | 13,454.2 |
| | 2016-10-02 | 18,398.2 | 2 | 9,199.1 |
| First | 2017-03-23 | 14,299.1 | 3 | 4,766.4 |
| Standard | 2014-09-08 | 14,060.4 | 9 | 1,562.3 |
| First | 2017-10-22 | 13,716.5 | 1 | 13,716.5 |
| Standard | 2016-12-17 | 12,185.1 | 5 | 2,437.0 |
| | 2017-11-17 | 12,112.5 | 8 | 1,514.1 |
| | 2015-09-17 | 11,467.6 | 5 | 2,293.5 |
| | 2016-05-23 | 10,561.0 | 4 | 2,640.2 |
| | 2014-09-23 | 10,478.6 | 5 | 2,095.7 |
Where are the clients from? To find out, we need to combine the order data with the customer data. Let's load the customers into the customers variable and see who they are.
customers = pd.read_csv('customers.csv')
customers.sample(2)
| | id | name | segment | state | city |
---|---|---|---|---|---|
137 | DJ-13510 | Don Jones | Corporate | Tennessee | Murfreesboro |
159 | NG-18430 | Nathan Gelder | Consumer | Ohio | Cincinnati |
# filter by customer_id to look at one customer's orders
id = 'BT-11530'
data.query('customer_id == @id')[['order_date', 'sales']]
| | order_date | sales |
---|---|---|
69 | 2014-05-21 | 31.8 |
560 | 2014-12-22 | 216.4 |
2445 | 2016-01-22 | 165.6 |
3131 | 2017-02-26 | 2,110.7 |
3271 | 2017-09-30 | 160.0 |
# calculate the sum of his orders
data.query('customer_id == @id')['sales'].sum()
2684.4919999999997
States = ['Tennessee', 'Ohio']
# Filter customers by state
filtered_customers = customers.query('state in @States')
# Filter orders by filtered_customer_id
filtered_customers_id = filtered_customers['id']
filtered_data = data.query('customer_id in @filtered_customers_id')
# We group data by customer_id and calculate the sum of sales.
# To create a dataframe from this with the customer_id and sum columns, we need to add .reset_index().
# Otherwise the resulting object will be a Series with "customer_id" as index and the Sum of sales as values.
grouped_data = filtered_data.groupby('customer_id')['sales'].sum().reset_index()
# Rename column
grouped_data = grouped_data.rename(columns={'sales': 'sum'})
# Sort by customer_id (ascending by default)
grouped_data.sort_values(by='customer_id').head()
| | customer_id | sum |
---|---|---|
0 | AB-10255 | 914.5 |
1 | AG-10390 | 200.9 |
2 | BE-11410 | 2,261.4 |
3 | BG-11695 | 1,874.2 |
4 | BM-11785 | 1,338.8 |
Now let's merge the customers and orders tables. DataFrames are combined using the .concat(), .merge() and .join() methods. They solve similar problems but differ in syntax and defaults; a sketch of an equivalent .join() call appears after the merged table below.
# rename the id column in customers to customer_id to avoid duplication of columns
customers = customers.rename(columns={'id': 'customer_id'})
# join the tables by customer_id via inner join
new_df = pd.merge(data, customers, how='inner', on='customer_id')
new_df.head(3)
| | id | order_date | ship_mode | customer_id | sales | name | segment | state | city |
---|---|---|---|---|---|---|---|---|---|
0 | 100006 | 2014-09-07 | Standard | DK-13375 | 378.0 | Dennis Kane | Consumer | Ohio | Marion |
1 | 131884 | 2015-12-06 | Same Day | DK-13375 | 594.0 | Dennis Kane | Consumer | Ohio | Marion |
2 | 145065 | 2015-12-12 | First | DK-13375 | 32.3 | Dennis Kane | Consumer | Ohio | Marion |
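For comparison, here is a sketch of the same inner join expressed through .join(), which joins on the index by default; it is not part of the original analysis, and the result matches new_df up to column order:
new_df_join = (
    data.set_index('customer_id')
        .join(customers.set_index('customer_id'), how='inner')
        .reset_index()
)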
# Find 5 cities with the largest revenue in 2016.
# Since we have already converted the date column to datetime, we filter the data by year.
# The Series.dt.year accessor returns a Series containing the year of each date.
filtered_df = new_df[new_df['order_date'].dt.year == 2016]
# Now display the result: group by city, sum sales, sort in descending order,
# convert to a dataframe, reset the index and set the ordinal numbering, and then display the top 5 cities.
filtered_df.groupby('city')['sales'].sum().sort_values(ascending=False).reset_index().head()
| | city | sales |
---|---|---|
0 | New York City | 53,094.1 |
1 | Philadelphia | 39,895.5 |
2 | Seattle | 33,955.5 |
3 | Los Angeles | 33,611.1 |
4 | San Francisco | 27,990.0 |
# see what years are presented in the table
data['order_date'].dt.year.unique()
# Output: array([2014, 2015, 2016, 2017], dtype=int64)
# We only have data for 4 years, so filtering will only occur by shipping class
Delivery_class = 'First'
data.query('ship_mode == @Delivery_class')['id'].count()
787
# Attach the state column to the orders table
df_with_state = pd.merge(data, customers[['customer_id', 'state']], how='left', on='customer_id')
df_with_state.head()
# Let's count the total number of rows with the entry "California", which means all customer_id values, including duplicate ones.
California_clients = df_with_state.query('state == "California"')['customer_id'].count()
# But the nunique() function returns the number of unique values in the specified column.
California_clients = df_with_state.query('state == "California"')['customer_id'].nunique()
# How many orders did they make?
orders_count = df_with_state.query('state == "California"')['id'].count()
# Print results
print(f'{California_clients} clients from California made {orders_count} orders.')
161 clients from California made 1006 orders.
# We will use the .agg() method: first, it computes all the required metrics at once,
# and second, it returns a DataFrame, unlike .sum()
summary_table_by_year = df_with_state.groupby(['state', df_with_state['order_date'].dt.year])['sales'].agg(['sum', 'count', 'mean'])
summary_table_by_year.head(10)
| | | sum | count | mean |
---|---|---|---|---|
| state | order_date | | | |
| Alabama | 2014 | 3,829.7 | 9 | 425.5 |
| | 2015 | 6,016.4 | 12 | 501.4 |
| | 2016 | 13,669.6 | 16 | 854.3 |
| | 2017 | 7,523.3 | 18 | 418.0 |
| Arizona | 2014 | 11,403.9 | 24 | 475.2 |
| | 2015 | 21,801.6 | 31 | 703.3 |
| | 2016 | 23,069.1 | 33 | 699.1 |
| | 2017 | 25,711.6 | 48 | 535.7 |
| Arkansas | 2014 | 3,294.0 | 3 | 1,098.0 |
| | 2015 | 111.7 | 2 | 55.8 |
# Let's calculate the total revenue and the number of orders for each customer
data.groupby('customer_id')['sales'].agg(['sum', 'count']).head()
| | sum | count |
---|---|---|
| customer_id | | |
AA-10315 | 5,563.6 | 5 |
AA-10375 | 1,056.4 | 9 |
AA-10480 | 1,790.5 | 4 |
AA-10645 | 5,086.9 | 6 |
AB-10015 | 886.2 | 3 |
# To find the date of each customer's first purchase, group the data by customer_id
# and take the min of the order_date field. The result will be saved in the first_orders variable.
# The two lines below give the same values; the second one also renames the min column to first_order and resets the index.
first_orders = data.groupby('customer_id')['order_date'].agg(['min'])
first_orders = data.groupby('customer_id')['order_date'].min().rename('first_order').reset_index()
first_orders.head()
| | customer_id | first_order |
---|---|---|
0 | AA-10315 | 2014-03-31 |
1 | AA-10375 | 2014-04-21 |
2 | AA-10480 | 2014-05-04 |
3 | AA-10645 | 2014-06-22 |
4 | AB-10015 | 2014-02-18 |
Cohorts here are people who made their first purchase on a given day; the tracked metrics are the number of orders and revenue.
We calculate the revenue and the number of orders for each cohort.
# First, we add the date of the first purchase by users to the table with orders.
orders_merged = data.merge(first_orders, how='inner', on='customer_id')
orders_merged.head()
| | id | order_date | ship_mode | customer_id | sales | first_order |
---|---|---|---|---|---|---|
0 | 100006 | 2014-09-07 | Standard | DK-13375 | 378.0 | 2014-09-07 |
1 | 131884 | 2015-12-06 | Same Day | DK-13375 | 594.0 | 2014-09-07 |
2 | 145065 | 2015-12-12 | First | DK-13375 | 32.3 | 2014-09-07 |
3 | 133046 | 2017-07-27 | Second | DK-13375 | 298.0 | 2014-09-07 |
4 | 165099 | 2017-12-11 | First | DK-13375 | 1.4 | 2014-09-07 |
# Aggregate by the first_order date and by order_date and calculate the necessary indicators:
orders_cohorts = orders_merged.groupby(['first_order', 'order_date'])['sales'].agg(['sum','count'])
orders_cohorts.head(12)
| | | sum | count |
---|---|---|---|
| first_order | order_date | | |
| 2014-01-03 | 2014-01-03 | 16.4 | 1 |
| | 2014-11-12 | 153.1 | 1 |
| | 2015-04-18 | 209.6 | 1 |
| | 2015-11-24 | 383.6 | 1 |
| | 2016-05-15 | 7.8 | 1 |
| | 2016-11-14 | 37.6 | 1 |
| | 2017-03-04 | 89.6 | 1 |
| | 2017-05-14 | 88.0 | 1 |
| | 2017-08-26 | 65.0 | 1 |
| 2014-01-04 | 2014-01-04 | 288.1 | 1 |
| | 2015-09-25 | 183.4 | 1 |
| | 2015-11-20 | 344.4 | 1 |
Now we know how much the store has earned from each cohort over time.
Next, let's find out how many days passed between the first purchase and each following order, and drop the orders placed more than 365 days after the first one. To calculate the number of days between orders, subtract the first_order column from the order_date column.
# Create a boolean filter (a Series of True/False values)
year_1_filter = orders_merged['order_date'] - orders_merged['first_order'] <= '365 days'
year_1_filter
0        True
1       False
2       False
3       False
4       False
        ...
5004     True
5005     True
5006    False
5007     True
5008    False
Length: 5009, dtype: bool
# Filter out unnecessary orders from the cohort table and store the result in the year_1_orders table.
# Just put the filter inside the square brackets instead of spelling out the condition
year_1_orders = orders_merged[year_1_filter]
year_1_orders.head()
| | id | order_date | ship_mode | customer_id | sales | first_order |
---|---|---|---|---|---|---|
0 | 100006 | 2014-09-07 | Standard | DK-13375 | 378.0 | 2014-09-07 |
8 | 100090 | 2014-07-08 | Standard | EB-13705 | 699.2 | 2014-07-08 |
9 | 129938 | 2014-12-15 | Second | EB-13705 | 445.8 | 2014-07-08 |
11 | 128125 | 2015-03-31 | Standard | EB-13705 | 120.8 | 2014-07-08 |
17 | 100293 | 2014-03-14 | Standard | NF-18475 | 91.1 | 2014-03-14 |
# Now let's group the orders by the date of the first purchase and order_date and calculate the 'sum' and 'count' metrics.
cohorts = year_1_orders.groupby(['first_order','order_date'])['sales'].agg(['sum', 'count'])
cohorts.head()
| | | sum | count |
---|---|---|---|
| first_order | order_date | | |
| 2014-01-03 | 2014-01-03 | 16.4 | 1 |
| | 2014-11-12 | 153.1 | 1 |
| 2014-01-04 | 2014-01-04 | 288.1 | 1 |
| 2014-01-05 | 2014-01-05 | 19.5 | 1 |
| 2014-01-06 | 2014-01-06 | 4,407.1 | 3 |
# Last step: calculate how much revenue and how many orders customers bring in on average during their first year.
# First sum the metrics within each cohort, and then average them with the mean() method.
# Since we did not specify a particular column to average,
# it computed the average for both the sum and the count columns.
cohorts.groupby(['first_order']).sum().mean()
sum     1,949.9
count       4.0
dtype: float64