Alex Knittel - Data Scientist

Code

    def analyze_missing(df):
        '''Print and plot a summary of the missing values in ``df``.

        Three figures are reported, each as a fraction between 0 and 1
        (they are not scaled to 0-100, despite the "Percentage" labels):

        * the 10 columns with the largest share of missing entries,
        * the share of rows containing at least one missing entry,
        * the share of all cells in the frame that are missing.

        A histogram of the per-column fractions is drawn through the
        module-level ``plt`` (expected to be ``matplotlib.pyplot``); the
        figure is neither shown nor saved here.

        Args:
            df: pandas DataFrame to inspect.

        Returns:
            None. Results are printed and plotted.
        '''
        num_rows, num_columns = df.shape

        # With no rows or no columns every ratio below would divide by
        # zero, so report the situation instead of raising.
        if num_rows == 0 or num_columns == 0:
            print('DataFrame is empty; no missing-value statistics to report.')
            return

        # Boolean mask of missing cells. Taking the mean of booleans yields
        # a float fraction, which avoids integer floor division under
        # Python 2.
        missing = df.isnull()

        # Fraction missing by column.
        columns = missing.mean()
        top_columns = columns.sort_values(ascending=False).head(10)
        # Single-argument print(...) with %-formatting produces the same
        # output under both the Python 2 print statement and Python 3.
        print('Top 10 columns with missing data:\n%s' % top_columns)
        plt.hist(columns)
        plt.title('Percentage of Missing Values for each Feature')

        # Fraction of rows containing at least one missing value.
        rows = missing.any(axis=1).sum() / float(num_rows)
        print('Percentage of rows containing missing data: %s' % rows)

        # Fraction of the whole dataframe that is missing.
        whole = missing.values.sum() / float(num_rows * num_columns)
        print('Percentage missing for all data: %s' % whole)