Flights - the sample project (Python | Pandas)

A sample data project to analyze, clean and explore Flight data from Raw Github CSS dataset. Python

Load dataset

              
                import pandas as pd
                !wget https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/challenges/data/flights.csv
                df_flights = pd.read_csv('flights.csv')
              
            

Missing Data

Find how many NULL values there are for each column, then explore a bit further, and finally replace NULL values.

              
                df_flights.isnull().sum()
                df_flights[df_flights.isnull().any(axis=1)][['DepDelay','DepDel15']]
                df_flights[df_flights.isnull().any(axis=1)].DepDelay.describe()
                df_flights.DepDel15 = df_flights.DepDel15.fillna(0)
                df_flights.isnull().sum()
              
            

Outliers

View the distribution and summary statistics for the DepDelay and ArrDelay columns.

              
                # Function to show summary stats and distribution for a column
def show_distribution(var_data):
        from matplotlib import pyplot as plt

    # Get statistics
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    mod_val = var_data.mode()[0]

    print(var_data.name,'\nMinimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val, mean_val, med_val, mod_val, max_val))

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize = (10,4))

    # Plot the histogram
    ax[0].hist(var_data)
    ax[0].set_ylabel('Frequency')

    # Add lines for the mean, median, and mode
    ax[0].axvline(x=min_val, color = 'gray', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mean_val, color = 'cyan', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=med_val, color = 'red', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mod_val, color = 'yellow', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=max_val, color = 'gray', linestyle='dashed', linewidth = 2)

    # Plot the boxplot
    ax[1].boxplot(var_data, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle(var_data.name)

    # Show the figure
    fig.show()

# Call the function for each delay field
delayFields = ['DepDelay','ArrDelay']
for col in delayFields:
        show_distribution(df_flights[col])
              
            

There are a outliers at the lower and upper ends of both variables - particularly at the upper end.

Let's trim the data so that we include only rows where the values for these fields are within the 1st and 90th percentile.

              
                # Trim outliers for ArrDelay based on 1% and 90% percentiles
          ArrDelay_01pcntile = df_flights.ArrDelay.quantile(0.01)
          ArrDelay_90pcntile = df_flights.ArrDelay.quantile(0.90)
          df_flights = df_flights[df_flights.ArrDelay < ArrDelay_90pcntile]
          df_flights = df_flights[df_flights.ArrDelay > ArrDelay_01pcntile]

          # Trim outliers for DepDelay based on 1% and 90% percentiles
          DepDelay_01pcntile = df_flights.DepDelay.quantile(0.01)
          DepDelay_90pcntile = df_flights.DepDelay.quantile(0.90)
          df_flights = df_flights[df_flights.DepDelay < DepDelay_90pcntile]
          df_flights = df_flights[df_flights.DepDelay > DepDelay_01pcntile]

          # View the revised distributions
          for col in delayFields:
            show_distribution(df_flights[col])
              
            

Explore

Let's start with an overall view of the summary statistics for the numeric columns, then answer a few questions regarding to the data.

              
                # Overall view of the summary statistics for the numeric columns.
          df_flights.describe()

          # What are the mean departure and arrival delays?
          df_flights[delayFields].mean()

          # How do the carriers compare in terms of arrival delay performance?
          for col in delayFields:
            df_flights.boxplot(column=col, by='Carrier', figsize=(8,8))

          # Are some days of the week more prone to arrival days than others?
          for col in delayFields:
            df_flights.boxplot(column=col, by='DayOfWeek', figsize=(8,8))

          # Which departure airport has the highest average departure delay?
          departure_airport_group = df_flights.groupby(df_flights.OriginAirportName)

          mean_departure_delays = pd.DataFrame(departure_airport_group['DepDelay'].mean()).sort_values('DepDelay', ascending=False)
          mean_departure_delays.plot(kind = "bar", figsize=(12,12))
          mean_departure_delays

          # Do late departures tend to result in longer arrival delays than on-time departures?
          df_flights.boxplot(column='ArrDelay', by='DepDel15', figsize=(12,12))

          # Which route (from origin airport to destination airport) has the most late arrivals?

          # Add a routes column
          routes  = pd.Series(df_flights['OriginAirportName'] + ' > ' + df_flights['DestAirportName'])
          df_flights = pd.concat([df_flights, routes.rename("Route")], axis=1)

          # Group by routes
          route_group = df_flights.groupby(df_flights.Route)
          pd.DataFrame(route_group['ArrDel15'].sum()).sort_values('ArrDel15', ascending=False)
              
            

Results screenshots Coming soon

Top