Industry average bar plots and scatter plots – Python

Final code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Loading the data and printing out its properties ---------------------------
df = pd.read_csv('companies.csv')
print df.shape
print df.info()
print df.describe()

# Data wrangling -------------------------------------------------------------

# Fix the CA column
df.loc[df['CA'] == 'Cliare', 'CA'] = 'Claire'

# Drop the outlier
df = df[df['Renewal fee'] < 9000]

# Fix the 'Risk' column
df.loc[df['Risk'] == 'Very High', 'Risk'] = 'Very high'

# Plotting -------------------------------------------------------------------
plt.figure
plt.gcf().subplots_adjust(bottom=0.15)

#Revenue per industry --------------------------------------------------------
industry_unique = df['Industry'].unique()

industry_count = []
industry_revenue = []
# Listing industry counts and revenues
for el in industry_unique:
	industry_count.append(len(df[df['Industry'] == el]))
	industry_revenue.append(df.loc[df['Industry'] == el, 'Renewal fee'].sum())

# Calculating averages per industry
industry_average = []
for i in range(len(industry_unique)):
	industry_average.append(industry_revenue[i] / industry_count[i])

# Ordering industry averages by value
industry_order = {}
# Adding industry unique names and correponding averages to industry_order dictionary
for i in range(len(industry_unique)):
    industry_order[industry_unique[i]] = industry_average[i]
# Sorting industry_unique by industry averages using industry_order
industry_unique = sorted(industry_unique, key=lambda x: industry_order [x])
# Sorting industry_average by value
industry_average = sorted(industry_average)

ax1 = plt.subplot(121)
xs = np.arange(len(industry_unique))
plt.xticks(xs, industry_unique)
plt.xticks(rotation = 30)
plt.bar(xs, industry_average, color = '#112244')
plt.grid(True, linestyle = '--', alpha = 0.6)
ax1.set_axisbelow(True)
plt.title('Average revenue per industry')

#Revenue per risk bar --------------------------------------------------------
risk_unique = df['Risk'].unique()

# Ordering risk_unique from 'Low' to 'Very high'
risk_order = {'Low': '0', 'Moderate': '1', 'High': '2', 'Very high': '3'}
risk_unique = sorted(risk_unique, key=lambda x: risk_order [x])

# Listing risk counts and revenues
risk_count = []
risk_revenue = []
for el in risk_unique:
	risk_count.append(len(df[df['Risk'] == el]))
	risk_revenue.append(df.loc[df['Risk'] == el, 'Renewal fee'].sum())

# Calculating averages per risk
risk_average = []
for i in range(len(risk_unique)):
	risk_average.append(risk_revenue[i] / risk_count[i])

ax2 = plt.subplot(122, sharey = ax1)
xs = np.arange(len(risk_unique))
plt.xticks(xs, risk_unique)
plt.xticks(rotation = 30)
plt.bar(xs, risk_average, color = '#881122')
plt.grid(True, linestyle = '--', alpha = 0.6)
ax2.set_axisbelow(True)
plt.title('Average revenue per risk')
plt.savefig('Revenues.jpg')

#Revenue per risk scatter ----------------------------------------------------
# Replacing 'Risk' values with numbers
df['Risk'].replace(risk_order, inplace= True) 

# Creating Dataframe to hold the unique 'Risk'-'Renewal fee' pairs with their number of occurances
df_risk = df.groupby(['Risk', 'Renewal fee']).size().reset_index(name="Size")

# X coordinate is 'Renewal fee'
xs = df_risk['Renewal fee']
# Y coordinate is 'Risk'
ys = df_risk['Risk']
# Size of marker is the number of occurances of the above X and Y
s = df_risk['Size']

plt.figure(2)
plt.gcf().subplots_adjust(bottom=0.15)
ax = plt.subplot(111)
x_range = np.arange(1000, 4000, 250)
plt.xticks(x_range, rotation = 30)
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#cc1122', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Revenue per risk scatter')
plt.savefig('Revenue_per_risk.jpg')

# Risk per industry scatter --------------------------------------------------

# Replacing 'Industry' values with numbers
industry_numeral = {'Computers': '0', 'General': '1', 'Mining': '2', 'Robotics': '3', 'Transport': '4'}
df['Industry'].replace(industry_numeral, inplace= True)
df['Industry'] = df['Industry'].astype(int)

# Creating Dataframe to hold the unique 'Risk'-'Industry' pairs with their number of occurances
df_industry = df.groupby(['Risk', 'Industry']).size().reset_index(name="Size")

# X coordinate is 'Industry'
xs = df_industry['Industry']
# Y coordinate is 'Risk'
ys = df_industry['Risk']
# Size of marker is the number of occurances of the above X and Y
s = df_risk['Size']

plt.figure(3)
plt.gcf().subplots_adjust(bottom=0.15)
ax = plt.subplot(111)
x_range = np.arange(5)
y_range = np.arange(4)
plt.xticks(x_range, ('Computers', 'General', 'Mining', 'Robotics', 'Transport'), rotation = 30)
plt.yticks(y_range, ('Low', 'Moderate', 'High', 'Very high'))
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#1122cc', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Risk per Industry scatter')
plt.savefig('Risk_per_Industry.jpg')

# Risk and industry revenue--------------------------------------------------

plt.figure(4)
plt.gcf().subplots_adjust(bottom=0.15)
ax = plt.subplot(121)
x_range = np.arange(5)
y_range = np.arange(4)
plt.xticks(x_range, ('Computers', 'General', 'Mining', 'Robotics', 'Transport'), rotation = 30)
plt.yticks(y_range, ('Low', 'Moderate', 'High', 'Very high'))
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#cc1122', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Risk per Industry scatter')

ax1 = plt.subplot(123)
xs = np.arange(len(industry_unique))
plt.xticks(xs, industry_unique)
plt.xticks(rotation = 30)
plt.bar(xs, industry_average, color = '#112244')
plt.grid(True, linestyle = '--', alpha = 0.6)
ax1.set_axisbelow(True)
plt.title('Average revenue per industry')
plt.savefig('Risk_and_Industry_revenue.jpg')

I finished the previous part with the plot below.Per_month plot
Looks like the company’s revenues are falling but is this really the case? You can see that both charts have the same shape. It means that the same time the revenues changed the number of policies changed too, so the renewal fees per policy don’t change. It’s the change in the number of policies that causes this fluctuation. Also, the number of policies and the total revenue for February and May is almost the same.

As a data analyst or data scientist it is not necessary my job to answer the question: why this is happening? I have found a pattern in the data and now I can present it to the relevant department. The guys there have more information about their processes and based on my findings they can investigate further.

However in this case the answer may be very simple. April is the start of the new financial year, it is quite possible that what I see here is an end of year rush to renew insurance policies and the drop in May is just business returning to its normal flow.
Maybe one or two client advisors went to holiday and the business just doesn’t have the capacity to deal with renewals. In this case they probably want to address this issue.

Now let’s plot some more.

You can get the dataset from: https://github.com/zmraz/data-science/tree/master/datasets

I am looking for patterns and connections in the data and since I have a ‘Renewal fee’ column I can try to find out how my company can increase its revenue from these fees.

Is there any connection between the renewal fee and the other columns? I have already dropped two columns from the original dataframe as the values in them were all unique and thus I couldn’t use them to group my data.

After looking at the remaining columns I think I start working on the ‘Industry’ column. My theory is that there may be industries where the companies pay higher renewal fees.

I am going to plot industry averages, first summing the revenues per industry and divide them by the number of companies per industry.


#Revenue per industry
industry_unique = df['Industry'].unique()

industry_count = []
industry_revenue = []
for el in industry_unique:
	industry_count.append(len(df[df['Industry'] == el]))
	industry_revenue.append(df.loc[df['Industry'] == el, 'Renewal fee'].sum())

industry_average = []
for i in range(len(industry_unique)):
	industry_average.append(industry_revenue[i] / industry_count[i])

industry_order = {}
for i in range(len(industry_unique)):
    industry_order[industry_unique[i]] = industry_average[i]

industry_unique = sorted(industry_unique, key=lambda x: industry_order [x])
industry_average = sorted(industry_average)

plt.subplot(121, axisbg = '#119922')
xs = np.arange(len(industry_unique))
plt.xticks(xs, industry_unique)
plt.xticks(rotation = 70)
plt.bar(xs, industry_average, alpha = 0.7, color = '#112299')
plt.grid(True, linestyle = '--', alpha = 0.6)
plt.title('Average revenue per industry')

Now I do the same using the ‘Risk’ column. After all higher risk could mean higher fees.


#Revenue per risk
risk_unique = df['Risk'].unique()

risk_count = []
risk_revenue = []
for el in risk_unique:
	risk_count.append(len(df[df['Risk'] == el]))
	risk_revenue.append(df[loc[df['Risk'] == el, 'Renewal fee'].sum())

risk_average = []
for i in range(len(risk_unique):
	risk_average.append(risk_revenue[i] / risk_count[i])

plt.subplot(122, facecolor = '#999911')
xs = np.arange(len(risk_unique))
plt.xticks(xs, risk_unique)
plt.xticks(rotation = 70)
plt.bar(xs, risk_average, alpha = 0.7, color = '#881122')
plt.grid(True, linestyle = '--', alpha = 0.6)
plt.title('Average revenue per risk')

Revenues plot


It looks promising. I can see that different type of industries have different average renewal fee and the renewal fee progressively gets higher as the risk increases.

Just for fun and exercise I plot the ‘Risk’ and the ‘Renewal fee’ column in a different way. I use a scatter plot this time and instead of averages I will plot each renewal fee. The X axis will be the ‘Renewal fee’ and the Y axis is the ‘Risk’ columns. Dots will represent policies with the corresponding risk and renewal fee. The bigger the dots the policies we have at that point.


#Revenue per risk scatter ----------------------------------------------------

# Replacing 'Risk' values with numbers
df['Risk'].replace(risk_order, inplace= True) 

# Replacing 'Industry' values with numbers
industry_numeral = {'Computers': '0', 'General': '1', 'Mining': '2', 'Robotics': '3', 'Transport': '4'}
df['Industry'].replace(industry_numeral, inplace= True) 

# Creating Dataframe to hold the unique 'Risk'-'Renewal fee' pairs with their number of occurances
df_risk = df.groupby(['Risk', 'Renewal fee']).size().reset_index(name="Size")

# X coordinate is 'Renewal fee'
xs = df_risk['Renewal fee']
# Y coordinate is 'Risk'
ys = df_risk['Risk']
# Size of marker is the number of occurrences of the above X and Y
s = df_risk['Size']

plt.figure(2)
ax = plt.subplot(111)
x_range = np.arange(1000, 4000, 250)
plt.xticks(x_range, rotation = 30)
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#cc1122', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Revenue per risk scatter')
plt.savefig('Revenue_per_risk.jpg')

Revenue_per_risk plot


The above plot is extremely useful! It sure shows that higher risk clients are paying higher renewal fees but I can also see that I can find clients with lower risk paying high renewal fees. There is the smaller dot at the top of the plot: it has a risk level of 3 (Very high) and the renewal fee is 2,500.00 while I can see another dot with risk level 1 (Moderate) paying 3,000.00!

How can I find these clients? Is there a relation in the data I can use?

I saw that different industry branches have different average renewal fees. Maybe it depends on which industry the client is working in?

The question I need to ask: are the revenues in the ‘Average revenue per industry’ plot different because the companies in certain industry branches are willing to pay higher fees or simply different industry branches have different risk levels which increases the policy fees?

In other words: are the ‘Industry’ and ‘Risk’ features are independent of each other or not?

Let’s draw another plot! This time a scatter plot


# Risk per industry scatter --------------------------------------------------

# Replacing 'Industry' values with numbers
industry_numeral = {'Computers': '0', 'General': '1', 'Mining': '2', 'Robotics': '3', 'Transport': '4'}
df['Industry'].replace(industry_numeral, inplace= True) 

# Creating Dataframe to hold the unique 'Risk'-'Industry' pairs with their number of occurances
df_industry = df.groupby(['Risk', 'Industry']).size().reset_index(name="Size")

# X coordinate is 'Industry'
xs = df_industry['Industry']
# Y coordinate is 'Risk'
ys = df_industry['Risk']
# Size of marker is the number of occurances of the above X and Y
s = df_risk['Size']

plt.figure(2)
ax = plt.subplot(111)
x_range = np.arange(5)
y_range = np.arange(4)
plt.xticks(x_range, ('Computers', 'General', 'Mining', 'Robotics', 'Transport'), rotation = 30)
plt.yticks(y_range, ('Low', 'Moderate', 'High', 'Very high'))
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#cc1122', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Risk per Industry scatter')
plt.savefig('Risk_per_Industry.jpg')

Risk_per_Industry plot


This looks really promising!

What am I looking at? At the bottom of the plot are client with low risk, the bigger the marker the more clients are there. Reading from left to right I can see that all clients belonging to the ‘Computers’ industry category are ‘Low’ or ‘Moderate’ risk. The same is true for ‘General’. Clients in ‘Mining’ however have ‘Very high’ risk category. ‘Robotics’ are the same as ‘Computers’ and ‘General’ and ‘Transport’ clients are kind of all over the place but in general they are more risky.

Since I am looking for clients with low risk and high renewal fees the industries I am interested in are ‘Computers’, ‘General’, and ‘Robotics’.

I redraw this last plot and the bar plot ‘Average revenue per industry’ next to each other.


# Risk and industry revenue--------------------------------------------------
plt.figure(4)
plt.gcf().subplots_adjust(bottom=0.15)
ax = plt.subplot(121)
x_range = np.arange(5)
y_range = np.arange(4)
plt.xticks(x_range, ('Computers', 'General', 'Mining', 'Robotics', 'Transport'), rotation = 30)
plt.yticks(y_range, ('Low', 'Moderate', 'High', 'Very high'))
plt.scatter(xs, ys, s = s*300, alpha = 0.9, color = '#cc1122', edgecolor = '#000000')
plt.grid(True, linestyle = '-', alpha = 0.6)
ax.set_axisbelow(True)
plt.ylabel('Risk')
plt.title('Risk per Industry scatter')

ax1 = plt.subplot(123)
xs = np.arange(len(industry_unique))
plt.xticks(xs, industry_unique)
plt.xticks(rotation = 30)
plt.bar(xs, industry_average, color = '#112244')
plt.grid(True, linestyle = '--', alpha = 0.6)
ax1.set_axisbelow(True)
plt.title('Average revenue per industry')
plt.savefig('Risk_and_Industry_revenue.jpg')

Risk_and_Industry_revenue plot 


Conclusion: while ‘Computers’, ‘General’, and ‘Robotics’ all have lower level risk clients, ‘General’ also has the lowest average of renewal fees too. If the company wants to focus on clients with high renewal fee and low risk it should focus on clients who belong to ‘Computers’ or ‘Robotics’ industries, ‘Computers’ being the best choice.


Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s