Validation neutralization assays versus polyclonal fits

Compare actual measured neutralization values for specific mutants to the polyclonal fits.

Import Python modules:

[1]:
import os
import pickle

import altair as alt

import pandas as pd
import numpy as np

import yaml

from scipy import stats

import warnings
warnings.simplefilter("ignore")

palette = ['#999999', '#0072B2',  '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7"]

extended_palette = ['#999999', '#0072B2',  '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7", '#9F0162']

long_palette = ['#9F0162', '#009F81', '#FF5AAF', '#8400CD', '#008DF9', '#00C2F9', '#FFB2FD', '#A40122', '#E20134', '#FF6E3A', '#FFC33B', '#00FCCF']

Read configuration and validation assay measurements:

[2]:
with open("config.yaml") as f:
    config = yaml.safe_load(f)

validation_ic50s = pd.read_csv(config["validation_ics"], na_filter=None)

validation_ic50s['max % neutralization'] = np.clip(validation_ic50s['max % neutralization'], a_min=0, a_max=100)

validation_ic50s
[2]:
antibody aa_substitutions max % neutralization measured IC50 MAP_fold_enrichment measured IC90 measured IC80
0 PGT151 100 0.065000
1 PGT151 N611S 98 0.187000 5.4
2 PGT151 S612P 81 0.150000 8
3 PGT151 N637A 81 0.046000 1
4 PGT151 N637K 59 0.092000 3.7
... ... ... ... ... ... ... ...
88 3BNC117 G459D 100 0.009945 0.043379 0.025188
89 3BNC117 R476M 100 0.002286 0.010977 0.005776
90 3BNC117 T202P 100 0.011801 0.086449 0.041454
91 3BNC117 N463S 100 0.014342 0.061920 0.036090
92 3BNC117 Q203P 100 0.004815 0.036248 0.017207

93 rows × 7 columns

Now get the predictions by the averaged polyclonal model fits:

[3]:
validation_vs_prediction = []
for antibody, antibody_df in validation_ic50s.groupby("antibody"):
    with open(os.path.join(config["escape_dir"], f"{antibody}.pickle"), "rb") as f:
        model = pickle.load(f)
    df = model.icXX(antibody_df)
    df = model.icXX(df, x=0.80, col="IC80")
    df = df.merge((model.mut_escape_df
                   .rename(columns={'mutation': 'aa_substitutions'})
                   [['aa_substitutions', 'times_seen']]
                  ), how='left', on='aa_substitutions')
    validation_vs_prediction.append(df)

validation_vs_prediction = pd.concat(validation_vs_prediction, ignore_index=True)

validation_vs_prediction
[3]:
antibody aa_substitutions max % neutralization measured IC50 MAP_fold_enrichment measured IC90 measured IC80 mean_IC50 median_IC50 std_IC50 frac_models mean_IC80 median_IC80 std_IC80 n_models times_seen
0 1-18 100 0.012082 0.082944 0.040739 0.144254 0.148228 0.044315 1.0 0.577016 0.592914 0.177258 3 NaN
1 1-18 G459D 100 0.004669 0.035790 0.016877 0.135608 0.140220 0.037858 1.0 0.542431 0.560881 0.151432 3 7.333333
2 1-18 N276D 100 0.005727 0.014183 0.007466 0.121682 0.120510 0.029322 1.0 0.486727 0.482039 0.117286 3 5.666667
3 1-18 N276D G459D 100 0.001909 0.007638 0.004369 0.114543 0.113999 0.024161 1.0 0.458173 0.455996 0.096642 3 NaN
4 1-18 N463S 100 0.018143 0.096897 0.052213 0.170952 0.192581 0.063604 1.0 0.683809 0.770325 0.254416 3 3.666667
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
93 PGT151 T639A 78 0.049000 1.4 0.117905 0.133545 0.034327 1.0 0.471619 0.534181 0.137308 3 9.333333
94 PGT151 T639K 74 0.041000 4.8 0.303773 0.286245 0.115182 1.0 1.215092 1.144980 0.460730 3 8.333333
95 PGT151 T639R 79 0.042000 2.8 0.122822 0.133414 0.041905 1.0 0.491289 0.533657 0.167619 3 4.333333
96 PGT151 T644A 96 0.052000 1 0.111485 0.125146 0.030082 1.0 0.445941 0.500585 0.120327 3 8.666667
97 PGT151 T644R 87 0.094000 4 0.280587 0.318181 0.188654 1.0 1.122348 1.272726 0.754615 3 4.000000

98 rows × 16 columns

For each antibody, calculate the Pearson correlation coefficient between the predicted IC50s from our models and the IC50s measured in validation assays. We are doing this first for only single mutants:

[4]:
print("Single mutant correlations between DMS predicted and neutralization assay measured IC50s/IC80s:")
for antibody, antibody_df in validation_vs_prediction.groupby('antibody'):
    if antibody == "PGT151":
        antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
        antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
        print(f"{antibody} (IC50):")
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            antibody_df["median_IC50"].astype(float),
            antibody_df["measured IC50"].astype(float))
        print(round(r_value**2,3))
        print("PGT151 MAP_fold_enrichment correlation:")
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            antibody_df["MAP_fold_enrichment"].astype(float),
            antibody_df["measured IC50"].astype(float))
        print(round(r_value**2,3))
    else:
        antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
        antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
        print(f"{antibody}:")
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            antibody_df["median_IC80"].astype(float),
            antibody_df["measured IC80"].astype(float))
        print(round(r_value**2,3))
Single mutant correlations between DMS predicted and neutralization assay measured IC50s/IC80s:
1-18:
0.186
3BNC117:
0.606
IDC508:
0.577
IDC513:
0.681
IDC561:
0.362
IDF033:
0.804
PGT151 (IC50):
0.813
PGT151 MAP_fold_enrichment correlation:
0.303

Now, plot the results. We will plot the median across the replicate polyclonal fits to different deep mutational scanning replicates. This is an interactive plot that you can mouse over for details:

[5]:
plot_data = validation_vs_prediction.query("frac_models>=.5").query('antibody!="PGT151"')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]
plot_data['measured IC80'] = plot_data['measured IC80'].astype(float)
corr_chart = (
    alt.Chart(plot_data)
    .encode(
        x=alt.X("measured IC80", scale=alt.Scale(type="log", nice=False)),
        y=alt.Y(
            "median_IC80",
            title="predicted IC80 from DMS",
            scale=alt.Scale(type="log", nice=False),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(scheme="tableau10")),
        tooltip=[
            alt.Tooltip(c, format=".3g") if validation_vs_prediction[c].dtype == float
            else c
            for c in validation_vs_prediction.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

corr_chart
[5]:

Now also calculate the fold changes, using the median prediction:

[6]:
sera_validation_vs_prediction = validation_vs_prediction.query('antibody!="PGT151"').copy()
sera_validation_vs_prediction['median_IC80'] = sera_validation_vs_prediction['median_IC80'].astype(float)
sera_validation_vs_prediction['measured IC80'] = sera_validation_vs_prediction['measured IC80'].astype(float)


fold_changes = (
    sera_validation_vs_prediction
    .rename(columns={"median_IC80": "predicted IC80"})
#    .query("aa_substitutions != ''")
    [["antibody",
      "aa_substitutions",
      "measured IC80",
      "MAP_fold_enrichment",
      "predicted IC80",
      "max % neutralization",
      "times_seen",
      "frac_models",
      "n_models"]]
    .merge(
        sera_validation_vs_prediction
        .rename(columns={"median_IC80": "predicted IC80"})
        .query("aa_substitutions == ''")
        [["antibody", "measured IC80", "predicted IC80"]],
        on="antibody",
        how="left",
        validate="many_to_one",
        suffixes=[" mutant", " unmutated"],
    )
    .assign(
        measured_fold_change=lambda x: x["measured IC80 mutant"] / x["measured IC80 unmutated"],
        predicted_fold_change=lambda x: x["predicted IC80 mutant"] / x["predicted IC80 unmutated"],
    )
)

plot_data = fold_changes.query("frac_models>=.5").query('antibody!="PGT151"')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]

fold_change_chart = (
    alt.Chart(plot_data.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title="measured fold change IC80",
            scale=alt.Scale(type="log", nice=False),
        ),
        y=alt.Y(
            "predicted_fold_change",
            title="predicted fold change IC80",
            scale=alt.Scale(type="log", nice=False),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(scheme="tableau10")),
        tooltip=[
            alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
            else c
            for c in plot_data.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

for antibody in plot_data['antibody'].unique():
    antibody_df = fold_changes.query("frac_models>=.5").query("antibody==@antibody")
    antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
    print(f"{antibody}:")
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        antibody_df["predicted_fold_change"].astype(float),
        antibody_df["measured_fold_change"].astype(float))
    print(f"Predicted fold change correlation (R^2): {round(r_value**2,3)}")

fold_change_chart

1-18:
Predicted fold change correlation (R^2): 0.184
3BNC117:
Predicted fold change correlation (R^2): 0.614
IDC508:
Predicted fold change correlation (R^2): 0.587
IDC513:
Predicted fold change correlation (R^2): 0.671
IDC561:
Predicted fold change correlation (R^2): 0.257
IDF033:
Predicted fold change correlation (R^2): 0.82
[6]:

And now repeat these calculation and plots, this time including multi-mutants:

[7]:
print("All mutant correlations between DMS predicted and neutralization assay measured IC80s:")
for antibody, antibody_df in sera_validation_vs_prediction.groupby('antibody'):
    antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
    print(f"{antibody}:")
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        antibody_df["median_IC80"].astype(float),
        antibody_df["measured IC80"].astype(float))
    print(round(r_value**2,3))
    if antibody == "PGT151":
        print("PGT151 MAP_fold_enrichment correlation:")
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            antibody_df["MAP_fold_enrichment"].astype(float),
            antibody_df["measured IC50"].astype(float))
        print(round(r_value**2,3))
All mutant correlations between DMS predicted and neutralization assay measured IC80s:
1-18:
0.074
3BNC117:
0.635
IDC508:
0.813
IDC513:
0.377
IDC561:
0.037
IDF033:
0.76
[8]:
plot_data = sera_validation_vs_prediction.query("frac_models>=.5").query('antibody!="PGT151"')

corr_chart = (
    alt.Chart(plot_data)
    .encode(
        x=alt.X("measured IC80", scale=alt.Scale(type="log", nice=False)),
        y=alt.Y(
            "median_IC80",
            title="predicted IC80 from DMS",
            scale=alt.Scale(type="log", nice=False),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(scheme="tableau20")),
        tooltip=[
            alt.Tooltip(c, format=".3g") if validation_vs_prediction[c].dtype == float
            else c
            for c in validation_vs_prediction.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

corr_chart
[8]:
[9]:
plot_data = fold_changes.query("frac_models>=.5").query('antibody!="PGT151"')

fold_change_chart = (
    alt.Chart(plot_data.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title="measured fold change IC80",
            scale=alt.Scale(type="log", nice=False),
        ),
        y=alt.Y(
            "predicted_fold_change",
            title="predicted fold change IC80",
            scale=alt.Scale(type="log", nice=False),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(scheme="tableau20")),
        tooltip=[
            alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
            else c
            for c in plot_data.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

for antibody in plot_data['antibody'].unique():
    antibody_df = fold_changes.query("frac_models>=.5").query("antibody==@antibody")

    print(f"{antibody}:")
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        antibody_df["predicted_fold_change"].astype(float),
        antibody_df["measured_fold_change"].astype(float))
    print(f"Predicted fold change correlation (R^2): {round(r_value**2,3)}")

fold_change_chart
1-18:
Predicted fold change correlation (R^2): 0.066
3BNC117:
Predicted fold change correlation (R^2): 0.634
IDC508:
Predicted fold change correlation (R^2): 0.82
IDC513:
Predicted fold change correlation (R^2): 0.371
IDC561:
Predicted fold change correlation (R^2): 0.006
IDF033:
Predicted fold change correlation (R^2): 0.766
[9]:

The following cells produce figures for the paper:

[10]:
# Figure for paper
PGT151_fold_changes = (
    validation_vs_prediction.query('antibody=="PGT151"')
    .rename(columns={"median_IC50": "predicted IC50"})
    .query("aa_substitutions != ''")
    [["antibody",
      "aa_substitutions",
      "measured IC50",
      "MAP_fold_enrichment",
      "predicted IC50",
      "max % neutralization",
      "times_seen",
      "frac_models",
      "n_models"]]
    .merge(
        validation_vs_prediction.query('antibody=="PGT151"')
        .rename(columns={"median_IC50": "predicted IC50"})
        .query("aa_substitutions == ''")
        [["antibody", "measured IC50", "predicted IC50"]],
        on="antibody",
        how="left",
        validate="many_to_one",
        suffixes=[" mutant", " unmutated"],
    )
    .assign(
        measured_fold_change=lambda x: x["measured IC50 mutant"] / x["measured IC50 unmutated"],
        predicted_fold_change=lambda x: x["predicted IC50 mutant"] / x["predicted IC50 unmutated"],
    )
)

antibody_df = PGT151_fold_changes.query("frac_models>=.5").query("antibody=='PGT151'")
antibody_df["MAP_fold_enrichment"] = antibody_df["MAP_fold_enrichment"].astype(float)
print(f"PGT151")
slope, intercept, r_value, p_value, std_err = stats.linregress(
    antibody_df["MAP_fold_enrichment"].astype(float),
    antibody_df["measured_fold_change"].astype(float))
print(f"MAP fold change correlation: {round(r_value**2,3)}")

corr_chart = (
    alt.Chart(antibody_df.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title="measured fold change IC50",
            scale=alt.Scale(type="log", domain=(.4, 20)),
        ),
        y=alt.Y(
            "MAP_fold_enrichment",
            scale=alt.Scale(type="log", domain=(.4, 50)),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("max % neutralization",
                        title="max % neutralization",
                        scale=alt.Scale(scheme='yelloworangered')),        tooltip=[
            alt.Tooltip(c, format=".3g") if antibody_df[c].dtype == float
            else c
            for c in antibody_df.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=0.6)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

corr_chart
PGT151
MAP fold change correlation: 0.303
[10]:
[11]:
# Figure for paper
antibody_df = PGT151_fold_changes.query("frac_models>=.5").query("antibody=='PGT151'").query("aa_substitutions!=''")
antibody_df["MAP_fold_enrichment"] = antibody_df["MAP_fold_enrichment"].astype(float)
print("PGT151")
slope, intercept, r_value, p_value, std_err = stats.linregress(
    antibody_df["predicted_fold_change"].astype(float),
    antibody_df["measured_fold_change"].astype(float))
print(f"MAP fold change correlation: {round(r_value**2,3)}")

corr_chart = (
    alt.Chart(antibody_df.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title="measured fold change IC50",
            scale=alt.Scale(type="log", domain=(.4, 20)),
        ),
        y=alt.Y(
            "predicted_fold_change",
            scale=alt.Scale(type="log", domain=(.4, 50)),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("max % neutralization",
                        title="max % neutralization",
                        scale=alt.Scale(scheme='yelloworangered')),        tooltip=[
            alt.Tooltip(c, format=".3g") if antibody_df[c].dtype == float
            else c
            for c in antibody_df.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=60, opacity=0.6)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=150, height=150)
)

corr_chart
PGT151
MAP fold change correlation: 0.813
[11]:
[12]:
# figure for paper
figure_palette = ['#999999', '#0072B2',  '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7", '#9F0162']


plot_data = fold_changes.query("frac_models>=.5").query('antibody not in ["PGT151", "1-18", "3BNC117"]')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]

for antibody in plot_data['antibody'].unique():
    antibody_df = plot_data.query("frac_models>=.5").query("antibody==@antibody")

    print(f"{antibody}:")
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        antibody_df["predicted_fold_change"].astype(float),
        antibody_df["measured_fold_change"].astype(float))
    print(f"Predicted fold change correlation (R): {round(r_value,3)}")

fold_change_chart = (
    alt.Chart(plot_data.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title=None,
            scale=alt.Scale(type="log", nice=False),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "predicted_fold_change",
            title=None,
            scale=alt.Scale(type="log", nice=False),
            axis=alt.Axis(tickCount=4),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(range=figure_palette)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
            else c
            for c in plot_data.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=100, opacity=1, stroke='black', strokeWidth=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=125, height=125)
)
fold_change_chart
IDC508:
Predicted fold change correlation (R): 0.766
IDC513:
Predicted fold change correlation (R): 0.819
IDC561:
Predicted fold change correlation (R): 0.507
IDF033:
Predicted fold change correlation (R): 0.906
[12]:
[13]:
plot_data = fold_changes.query("frac_models>=.5").query('antibody not in ["PGT151", "1-18", "3BNC117"]')
virus_list = ["",
              "T198D",
              "N276D",
              "G459D",
              "T198D N276D",
              "T198D G459D",
              "N276D G459D",
              "T198D N276D G459D"]
plot_data = plot_data.query('aa_substitutions in @virus_list')

figure_palette = ['#999999', '#0072B2',  '#F0E442', '#E69F00', "#D55E00", '#009E73', '#56B4E9',  "#CC79A7", '#9F0162']

for antibody in plot_data['antibody'].unique():
    antibody_df = plot_data.query("frac_models>=.5").query("antibody==@antibody")

    print(f"{antibody}:")
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        antibody_df["predicted_fold_change"].astype(float),
        antibody_df["measured_fold_change"].astype(float))
    print(f"Predicted fold change correlation (R): {round(r_value,3)}")

fold_change_chart = (
    alt.Chart(plot_data.query("frac_models>=.5"))
    .encode(
        x=alt.X(
            "measured_fold_change",
            title=None,
            scale=alt.Scale(type="log", nice=False),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "predicted_fold_change",
            title=None,
            scale=alt.Scale(type="log", nice=False),
            axis=alt.Axis(tickCount=5),
        ),
        facet=alt.Facet("antibody", columns=4, title=None),
        color=alt.Color("aa_substitutions",
                        title="Amino acid substitutions",
                        scale=alt.Scale(range=figure_palette)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
            else c
            for c in plot_data.columns.tolist()
        ],
    )
    .mark_circle(filled=True, size=100, opacity=1, stroke='black', strokeWidth=1)
    .configure_axis(grid=False)
    .resolve_scale(y="independent", x="independent")
    .properties(width=125, height=125)
)
fold_change_chart
IDC508:
Predicted fold change correlation (R): 0.931
IDC513:
Predicted fold change correlation (R): 0.706
IDC561:
Predicted fold change correlation (R): 0.227
IDF033:
Predicted fold change correlation (R): 0.858
[13]:
[ ]:

[ ]: