Merging tables is an essential preprocessing step for data analysis. Tables can be easily merged with SQL or SQL-like Python functions. The pandas library offers multiple SQL-like and data-table management functions.
The code snippet at the right shows how to merge database tables available for download in the Query tab.
import pandas as pd
import numpy as np

# Paths to the CSV exports of the database tables (downloadable in the
# Query tab): hydraulics, subsurface sediment, IDO depth profiles,
# kf depth profiles, and the kf measurement positions.
csv_list = ['path/to/hydraulics/csv/',
            'path/to/subsurfacesed/csv/',
            'path/to/ido/csv/',
            'path/to/kf/csv/',
            'path/to/kf/positions/csv'
            ]
# Read every table into a DataFrame. A plain list comprehension is clearer
# than np.vectorize(pd.read_csv), which adds no real vectorization here.
dfs = [pd.read_csv(path) for path in csv_list]
# Merge the IDO table with the Kf table on the shared position/sample keys
df_global = dfs[-3].merge(dfs[-2], on=['meas_position', 'sample_id', 'dp_position'], how='outer')
df_global = df_global.drop_duplicates(subset=['sample_id', 'dp_position'])
# Average the IDO and Kf depth-profile values along the depth.
# numeric_only=True keeps non-numeric columns from raising under pandas >= 2.0.
df_avg_ido_kf = df_global.groupby('meas_position', as_index=False).mean(numeric_only=True)
# Merge the depth-explicit dataframe with the positions table
df_global = dfs[-1].merge(df_global, left_on='name', right_on='meas_position')
# Merge the depth-averaged dataframe with the positions table
df_global_avg = dfs[-1].merge(df_avg_ido_kf, left_on='name', right_on='meas_position')

models = ['Hydraulics',
          'SubsurfaceSed',
          'IDO',
          'Kf',
          'MeasPosition']
# Column suffixes that disambiguate overlapping names after each merge
suffixes = {'SubsurfaceSed': 'subsurf', 'Hydraulics': 'hyd'}
# Merge hydraulics and subsurface sediment data into both global tables
for i, df in enumerate(dfs[0:-3]):
    suffix = '_{}'.format(suffixes[models[i]])  # invariant per iteration: compute once
    df_global = df_global.merge(df, on='meas_position',
                                how='outer',
                                suffixes=('', suffix))
    df_global_avg = df_global_avg.merge(df, on='meas_position',
                                        how='outer',
                                        suffixes=('', suffix))
# Save the dataframes.
# Bug fix: the averaged table previously overwrote the depth-explicit one
# because both to_csv calls used the identical output path.
df_global.to_csv('path/to/save/depth-explicit/global/table')
df_global_avg.to_csv('path/to/save/depth-averaged/global/table')
The code snippet on the right produces the correlation plots shown at the top. To avoid biasing the analysis, df_global_avg
is used, in which the
depth-explicit parameters (hydraulic conductivity kf and interstitial dissolved oxygen IDO) were depth-averaged to produce one average value per
measurement position.
import plotly.express as px

# ID, coordinate, and free-text columns that carry no physical meaning
# for the correlation analysis
takeoff_cols = ['id_surf', 'id_subsurf', 'id_x', 'id_y', 'id', 'x', 'y', 'x_epsg', 'y_epsg',
                'x_epsg4326', 'y_epsg4326', 'sediment_depth_m_x', 'comment_x', 'comment_y',
                'dp_position', 'sediment_depth_m_y', ]
# Remove unnecessary columns.
# Bug fix: use the depth-AVERAGED table (df_global_avg), as described in the
# accompanying text — the depth-explicit df_global would over-weight positions
# with many depth samples and bias the correlations.
df_final = df_global_avg.loc[:, ~df_global_avg.columns.isin(takeoff_cols)]
# Compute the Spearman rank correlation matrix, rounded to one decimal place
df_corr = df_final.corr(method='spearman').round(1)
depth_explicit_feats = ['kf_ms', 'slurp_rate_avg_mls',
                        'idoc_mgl', 'idoc_sat', 'temp_c']
# Heatmap of correlations between the depth-averaged IDO/kf features
# and every remaining parameter
fig = px.imshow(df_corr.loc[depth_explicit_feats, :], text_auto=True, aspect='auto')
The code snippet on the right describes the method for performing PCA using sklearn; the results are shown above.
from sklearn.decomposition import PCA

# Features retained for dimensionality reduction; 'river' and 'name' are kept
# only for colouring/hover labels and are dropped before scaling.
features = ['idoc_mgl', 'wl_m', 'slurp_rate_avg_mls', 'river',
            'd84', 'dm', 'geom_std_grain',
            'percent_finer_2mm',
            'percent_finer_0_5mm',
            'name'
            ]
df4pca = df_global_avg[features].dropna()
df4pca_prescaling = df4pca.drop(['river', 'name'], axis=1)
# Standardize with the z-score approach. The vectorized column-wise form
# replaces the per-column apply(); both use the sample std (ddof=1), so the
# result is identical.
df4pca_final = (df4pca_prescaling - df4pca_prescaling.mean()) / df4pca_prescaling.std()
# Fit a 3-component PCA on the standardized features
pca = PCA(n_components=3)
components = pca.fit_transform(df4pca_final)
variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance)
total_var2d = variance.sum() * 100
# Axis labels carrying the explained variance of each principal component
labels = {
    str(i): f"PC {i + 1} ({var:.1f}%)"
    for i, var in enumerate(variance * 100)
}
# Scatter matrix of the three PCs, coloured and symbolized by river
fig2d = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(3),
    color=df4pca["river"],
    symbol=df4pca['river'],
    title=f'Total Explained Variance: {total_var2d:.2f}%',
    color_discrete_sequence=px.colors.qualitative.Bold,
    width=1000, height=700,
    hover_name=df4pca['name'],
)
fig2d.update_traces(diagonal_visible=False)