[Figure: Spearman Correlation Matrix]

[Figure: Principal Component Analysis (PCA)]

Code snippets



Preprocessing data

Merging tables is an essential preprocessing step for data analysis. Tables can be merged with SQL or with SQL-like Python functions; the pandas library offers several SQL-like merge and data-table management functions.
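
As a minimal illustration, the sketch below shows how an SQL join maps onto pandas.merge; the table and column names (samples, positions, position_id) are hypothetical and only serve to show the correspondence.

    import pandas as pd

    # Hypothetical example tables (names are illustrative only)
    samples = pd.DataFrame({'position_id': [1, 2, 2], 'kf_ms': [1e-4, 5e-5, 8e-5]})
    positions = pd.DataFrame({'position_id': [1, 2], 'river': ['A', 'B']})

    # SQL equivalent: SELECT * FROM samples INNER JOIN positions USING (position_id)
    merged = samples.merge(positions, on='position_id', how='inner')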

The code snippet on the right shows how to merge the database tables that are available for download from the Query tab.

                
    import pandas as pd
    import numpy as np

    csv_list = [    'path/to/hydraulics/csv/',
                    'path/to/subsurfacesed/csv/',
                    'path/to/ido/csv/',
                    'path/to/kf/csv/',
                    'path/to/kf/positions/csv'
    ]

    # Read each CSV into a dataframe (a plain list is simpler and more robust than np.vectorize here)
    dfs = [pd.read_csv(path) for path in csv_list]

    # merge IDO table with Kf table
    df_global = dfs[-3].merge(dfs[-2], on=['meas_position', 'sample_id', 'dp_position'], how='outer')
    df_global = df_global.drop_duplicates(subset=['sample_id', 'dp_position'])

    # Average the IDO and Kf depth-profile values over depth (one value per measurement position)
    df_avg_ido_kf = df_global.groupby('meas_position', as_index=False).mean(numeric_only=True)

    # Merge the depth-explicit dataframe with the positions table
    df_global = dfs[-1].merge(df_global, left_on='name', right_on='meas_position')

    # Merge the average dataframe with the positions table
    df_global_avg = dfs[-1].merge(df_avg_ido_kf, left_on='name', right_on='meas_position')


    models = [      'Hydraulics',
                    'SubsurfaceSed',
                    'IDO',
                    'Kf',
                    'MeasPosition']

    suffixes = {'SubsurfaceSed': 'subsurf', 'Hydraulics': 'hyd'}

    # Merge hydraulics and subsurface sediment data
    for i, df in enumerate(dfs[0:-3]):
        df_global = df_global.merge(df, on='meas_position',
                                    how='outer',
                                    suffixes=('', '_{}'.format(suffixes[models[i]])))
        df_global_avg = df_global_avg.merge(df, on='meas_position',
                                            how='outer',
                                            suffixes=('', '_{}'.format(suffixes[models[i]])))
    # Save the dataframes
    df_global.to_csv('path/to/save/depth-explicit/global/table')
    df_global_avg.to_csv('path/to/save/depth-averaged/global/table')
            
            
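A quick sanity check of the merged tables can catch join problems (duplicated keys, columns that are entirely NaN) before the analysis. The sketch below is a minimal, optional check and assumes df_global and df_global_avg from the snippet above are still in memory.

    # Minimal sanity checks on the merged tables (optional)
    print(df_global.shape, df_global_avg.shape)

    # Columns that are entirely NaN usually point to a failed join key
    print(df_global.columns[df_global.isna().all()].tolist())

    # Each measurement position should appear only once in the depth-averaged table
    assert not df_global_avg['meas_position'].duplicated().any()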

Spearman correlation analysis

The code snippet on the right produces the correlation plots shown at the top. To avoid biasing the analysis, the df_global_avg table is used, in which the depth-explicit parameters (hydraulic conductivity kf and interstitial dissolved oxygen IDO) were depth-averaged to yield one value per measurement position.

                
    import plotly.express as px

    # Columns to remove from global table
    takeoff_cols = ['id_surf', 'id_subsurf', 'id_x', 'id_y', 'id', 'x', 'y', 'x_epsg', 'y_epsg',
                    'x_epsg4326', 'y_epsg4326', 'sediment_depth_m_x', 'comment_x', 'comment_y',
                    'dp_position', 'sediment_depth_m_y', ]

    # Remove unnecessary columns from the depth-averaged table
    df_final = df_global_avg.loc[:, ~df_global_avg.columns.isin(takeoff_cols)]

    # Compute the Spearman correlation matrix
    df_corr = df_final.corr(method='spearman').round(1)

    depth_explicit_feats = ['kf_ms', 'slurp_rate_avg_mls',
                            'idoc_mgl', 'idoc_sat', 'temp_c']

    # Correlation matrix with averaged IDO and kf parameters
    fig = px.imshow(df_corr.loc[depth_explicit_feats, :], text_auto=True, aspect='auto')
            
            
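Beyond the heatmap, the strongest pairwise correlations can be listed directly from df_corr. The sketch below is a minimal example that assumes df_corr from the snippet above; the |rho| >= 0.7 threshold is an arbitrary choice.

    # List the strongest off-diagonal Spearman correlations (threshold is arbitrary)
    pairs = df_corr.stack()
    pairs = pairs[pairs.index.get_level_values(0) < pairs.index.get_level_values(1)]
    strong = pairs[pairs.abs() >= 0.7].sort_values(key=abs, ascending=False)
    print(strong)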

Principal Component analysis

The code snippet on the right shows how to perform PCA with scikit-learn (sklearn); the results are shown above.

                
    import numpy as np
    import plotly.express as px
    from sklearn.decomposition import PCA

    # Preparing df for dimensionality reduction
    features = ['idoc_mgl', 'wl_m', 'slurp_rate_avg_mls', 'river',
                'd84', 'dm', 'geom_std_grain',
                'percent_finer_2mm',
                'percent_finer_0_5mm',
                'name'
                ]

    df4pca = df_global_avg[features].dropna()
    df4pca_prescaling = df4pca.drop(['river', 'name'], axis=1)

    # Scaling with the z-score approach
    df4pca_final = df4pca_prescaling.apply(lambda x: (x - x.mean()) / x.std(), axis=0)

    # Build PCA
    pca = PCA(n_components=3)
    components = pca.fit_transform(df4pca_final)
    variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(variance)
    total_var2d = variance.sum() * 100

    # Axis labels showing the explained variance of each PC
    labels = {
        str(i): f"PC {i + 1} ({var:.1f}%)"
        for i, var in enumerate(pca.explained_variance_ratio_ * 100)
    }

    # Scatter matrix of the sample scores on the 3 PCs
    fig2d = px.scatter_matrix(
        components,
        labels=labels,
        dimensions=range(3),
        color=df4pca["river"],
        symbol=df4pca['river'],
        title=f'Total Explained Variance: {total_var2d:.2f}%',
        color_discrete_sequence=px.colors.qualitative.Bold,
        width=1000, height=700,
        hover_name=df4pca['name'],
    )
    fig2d.update_traces(diagonal_visible=False)
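
The scatter matrix above shows the sample scores on each component; how strongly each original variable contributes to a component can be read from the component weights (loadings) in pca.components_. The sketch below is one minimal way to tabulate them and assumes pca and df4pca_final from the snippet above.

    import pandas as pd

    # Component weights: contribution of each original feature to each principal component
    loadings = pd.DataFrame(
        pca.components_.T,
        index=df4pca_final.columns,
        columns=[f'PC{i + 1}' for i in range(pca.n_components_)],
    ).round(2)
    print(loadings)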