Flights - the sample project (Python | Pandas)
A sample data project to analyze, clean and explore Flight data from Raw Github CSS dataset. Python
Load dataset
import pandas as pd
!wget https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/challenges/data/flights.csv
df_flights = pd.read_csv('flights.csv')
Missing Data
Find how many NULL values there are for each column, then explore a bit further, and finally replace NULL values.
df_flights.isnull().sum()
df_flights[df_flights.isnull().any(axis=1)][['DepDelay','DepDel15']]
df_flights[df_flights.isnull().any(axis=1)].DepDelay.describe()
df_flights.DepDel15 = df_flights.DepDel15.fillna(0)
df_flights.isnull().sum()
Outliers
View the distribution and summary statistics for the DepDelay and ArrDelay columns.
# Function to show summary stats and distribution for a column
def show_distribution(var_data):
from matplotlib import pyplot as plt
# Get statistics
min_val = var_data.min()
max_val = var_data.max()
mean_val = var_data.mean()
med_val = var_data.median()
mod_val = var_data.mode()[0]
print(var_data.name,'\nMinimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val, mean_val, med_val, mod_val, max_val))
# Create a figure for 2 subplots (2 rows, 1 column)
fig, ax = plt.subplots(2, 1, figsize = (10,4))
# Plot the histogram
ax[0].hist(var_data)
ax[0].set_ylabel('Frequency')
# Add lines for the mean, median, and mode
ax[0].axvline(x=min_val, color = 'gray', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=mean_val, color = 'cyan', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=med_val, color = 'red', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=mod_val, color = 'yellow', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=max_val, color = 'gray', linestyle='dashed', linewidth = 2)
# Plot the boxplot
ax[1].boxplot(var_data, vert=False)
ax[1].set_xlabel('Value')
# Add a title to the Figure
fig.suptitle(var_data.name)
# Show the figure
fig.show()
# Call the function for each delay field
delayFields = ['DepDelay','ArrDelay']
for col in delayFields:
show_distribution(df_flights[col])
There are a outliers at the lower and upper ends of both variables - particularly at the upper end.
Let's trim the data so that we include only rows where the values for these fields are within the 1st and 90th percentile.
# Trim outliers for ArrDelay based on 1% and 90% percentiles
ArrDelay_01pcntile = df_flights.ArrDelay.quantile(0.01)
ArrDelay_90pcntile = df_flights.ArrDelay.quantile(0.90)
df_flights = df_flights[df_flights.ArrDelay < ArrDelay_90pcntile]
df_flights = df_flights[df_flights.ArrDelay > ArrDelay_01pcntile]
# Trim outliers for DepDelay based on 1% and 90% percentiles
DepDelay_01pcntile = df_flights.DepDelay.quantile(0.01)
DepDelay_90pcntile = df_flights.DepDelay.quantile(0.90)
df_flights = df_flights[df_flights.DepDelay < DepDelay_90pcntile]
df_flights = df_flights[df_flights.DepDelay > DepDelay_01pcntile]
# View the revised distributions
for col in delayFields:
show_distribution(df_flights[col])
Explore
Let's start with an overall view of the summary statistics for the numeric columns, then answer a few questions regarding to the data.
# Overall view of the summary statistics for the numeric columns.
df_flights.describe()
# What are the mean departure and arrival delays?
df_flights[delayFields].mean()
# How do the carriers compare in terms of arrival delay performance?
for col in delayFields:
df_flights.boxplot(column=col, by='Carrier', figsize=(8,8))
# Are some days of the week more prone to arrival days than others?
for col in delayFields:
df_flights.boxplot(column=col, by='DayOfWeek', figsize=(8,8))
# Which departure airport has the highest average departure delay?
departure_airport_group = df_flights.groupby(df_flights.OriginAirportName)
mean_departure_delays = pd.DataFrame(departure_airport_group['DepDelay'].mean()).sort_values('DepDelay', ascending=False)
mean_departure_delays.plot(kind = "bar", figsize=(12,12))
mean_departure_delays
# Do late departures tend to result in longer arrival delays than on-time departures?
df_flights.boxplot(column='ArrDelay', by='DepDel15', figsize=(12,12))
# Which route (from origin airport to destination airport) has the most late arrivals?
# Add a routes column
routes = pd.Series(df_flights['OriginAirportName'] + ' > ' + df_flights['DestAirportName'])
df_flights = pd.concat([df_flights, routes.rename("Route")], axis=1)
# Group by routes
route_group = df_flights.groupby(df_flights.Route)
pd.DataFrame(route_group['ArrDel15'].sum()).sort_values('ArrDel15', ascending=False)
Results screenshots Coming soon