Validation neutralization assays versus polyclonal
fits¶
Compare actual measured neutralization values for specific mutants to the polyclonal
fits.
Import Python modules:
[1]:
import os
import pickle
import altair as alt
import pandas as pd
import numpy as np
import yaml
from scipy import stats
import warnings
warnings.simplefilter("ignore")
palette = ['#999999', '#0072B2', '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7"]
extended_palette = ['#999999', '#0072B2', '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7", '#9F0162']
long_palette = ['#9F0162', '#009F81', '#FF5AAF', '#8400CD', '#008DF9', '#00C2F9', '#FFB2FD', '#A40122', '#E20134', '#FF6E3A', '#FFC33B', '#00FCCF']
Read configuration and validation assay measurements:
[2]:
with open("config.yaml") as f:
config = yaml.safe_load(f)
validation_ic50s = pd.read_csv(config["validation_ics"], na_filter=None)
validation_ic50s['max % neutralization'] = np.clip(validation_ic50s['max % neutralization'], a_min=0, a_max=100)
validation_ic50s
[2]:
antibody | aa_substitutions | max % neutralization | measured IC50 | MAP_fold_enrichment | measured IC90 | measured IC80 | |
---|---|---|---|---|---|---|---|
0 | PGT151 | 100 | 0.065000 | ||||
1 | PGT151 | N611S | 98 | 0.187000 | 5.4 | ||
2 | PGT151 | S612P | 81 | 0.150000 | 8 | ||
3 | PGT151 | N637A | 81 | 0.046000 | 1 | ||
4 | PGT151 | N637K | 59 | 0.092000 | 3.7 | ||
... | ... | ... | ... | ... | ... | ... | ... |
88 | 3BNC117 | G459D | 100 | 0.009945 | 0.043379 | 0.025188 | |
89 | 3BNC117 | R476M | 100 | 0.002286 | 0.010977 | 0.005776 | |
90 | 3BNC117 | T202P | 100 | 0.011801 | 0.086449 | 0.041454 | |
91 | 3BNC117 | N463S | 100 | 0.014342 | 0.061920 | 0.036090 | |
92 | 3BNC117 | Q203P | 100 | 0.004815 | 0.036248 | 0.017207 |
93 rows × 7 columns
Now get the predictions by the averaged polyclonal
model fits:
[3]:
validation_vs_prediction = []
for antibody, antibody_df in validation_ic50s.groupby("antibody"):
with open(os.path.join(config["escape_dir"], f"{antibody}.pickle"), "rb") as f:
model = pickle.load(f)
df = model.icXX(antibody_df)
df = model.icXX(df, x=0.80, col="IC80")
df = df.merge((model.mut_escape_df
.rename(columns={'mutation': 'aa_substitutions'})
[['aa_substitutions', 'times_seen']]
), how='left', on='aa_substitutions')
validation_vs_prediction.append(df)
validation_vs_prediction = pd.concat(validation_vs_prediction, ignore_index=True)
validation_vs_prediction
[3]:
antibody | aa_substitutions | max % neutralization | measured IC50 | MAP_fold_enrichment | measured IC90 | measured IC80 | mean_IC50 | median_IC50 | std_IC50 | frac_models | mean_IC80 | median_IC80 | std_IC80 | n_models | times_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1-18 | 100 | 0.012082 | 0.082944 | 0.040739 | 0.144254 | 0.148228 | 0.044315 | 1.0 | 0.577016 | 0.592914 | 0.177258 | 3 | NaN | ||
1 | 1-18 | G459D | 100 | 0.004669 | 0.035790 | 0.016877 | 0.135608 | 0.140220 | 0.037858 | 1.0 | 0.542431 | 0.560881 | 0.151432 | 3 | 7.333333 | |
2 | 1-18 | N276D | 100 | 0.005727 | 0.014183 | 0.007466 | 0.121682 | 0.120510 | 0.029322 | 1.0 | 0.486727 | 0.482039 | 0.117286 | 3 | 5.666667 | |
3 | 1-18 | N276D G459D | 100 | 0.001909 | 0.007638 | 0.004369 | 0.114543 | 0.113999 | 0.024161 | 1.0 | 0.458173 | 0.455996 | 0.096642 | 3 | NaN | |
4 | 1-18 | N463S | 100 | 0.018143 | 0.096897 | 0.052213 | 0.170952 | 0.192581 | 0.063604 | 1.0 | 0.683809 | 0.770325 | 0.254416 | 3 | 3.666667 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93 | PGT151 | T639A | 78 | 0.049000 | 1.4 | 0.117905 | 0.133545 | 0.034327 | 1.0 | 0.471619 | 0.534181 | 0.137308 | 3 | 9.333333 | ||
94 | PGT151 | T639K | 74 | 0.041000 | 4.8 | 0.303773 | 0.286245 | 0.115182 | 1.0 | 1.215092 | 1.144980 | 0.460730 | 3 | 8.333333 | ||
95 | PGT151 | T639R | 79 | 0.042000 | 2.8 | 0.122822 | 0.133414 | 0.041905 | 1.0 | 0.491289 | 0.533657 | 0.167619 | 3 | 4.333333 | ||
96 | PGT151 | T644A | 96 | 0.052000 | 1 | 0.111485 | 0.125146 | 0.030082 | 1.0 | 0.445941 | 0.500585 | 0.120327 | 3 | 8.666667 | ||
97 | PGT151 | T644R | 87 | 0.094000 | 4 | 0.280587 | 0.318181 | 0.188654 | 1.0 | 1.122348 | 1.272726 | 0.754615 | 3 | 4.000000 |
98 rows × 16 columns
For each antibody, calculate the Pearson correlation coefficient between the predicted IC50s from our models and the IC50s measured in validation assays. We are doing this first for only single mutants:
[4]:
print("Single mutant correlations between DMS predicted and neutralization assay measured IC50s/IC80s:")
for antibody, antibody_df in validation_vs_prediction.groupby('antibody'):
if antibody == "PGT151":
antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
print(f"{antibody} (IC50):")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["median_IC50"].astype(float),
antibody_df["measured IC50"].astype(float))
print(round(r_value**2,3))
print("PGT151 MAP_fold_enrichment correlation:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["MAP_fold_enrichment"].astype(float),
antibody_df["measured IC50"].astype(float))
print(round(r_value**2,3))
else:
antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["median_IC80"].astype(float),
antibody_df["measured IC80"].astype(float))
print(round(r_value**2,3))
Single mutant correlations between DMS predicted and neutralization assay measured IC50s/IC80s:
1-18:
0.186
3BNC117:
0.606
IDC508:
0.577
IDC513:
0.681
IDC561:
0.362
IDF033:
0.804
PGT151 (IC50):
0.813
PGT151 MAP_fold_enrichment correlation:
0.303
Now, plot the results. We will plot the median across the replicate polyclonal fits to different deep mutational scanning replicates. This is an interactive plot that you can mouse over for details:
[5]:
plot_data = validation_vs_prediction.query("frac_models>=.5").query('antibody!="PGT151"')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]
plot_data['measured IC80'] = plot_data['measured IC80'].astype(float)
corr_chart = (
alt.Chart(plot_data)
.encode(
x=alt.X("measured IC80", scale=alt.Scale(type="log", nice=False)),
y=alt.Y(
"median_IC80",
title="predicted IC80 from DMS",
scale=alt.Scale(type="log", nice=False),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(scheme="tableau10")),
tooltip=[
alt.Tooltip(c, format=".3g") if validation_vs_prediction[c].dtype == float
else c
for c in validation_vs_prediction.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
corr_chart
[5]:
Now also calculate the fold changes, using the median prediction:
[6]:
sera_validation_vs_prediction = validation_vs_prediction.query('antibody!="PGT151"').copy()
sera_validation_vs_prediction['median_IC80'] = sera_validation_vs_prediction['median_IC80'].astype(float)
sera_validation_vs_prediction['measured IC80'] = sera_validation_vs_prediction['measured IC80'].astype(float)
fold_changes = (
sera_validation_vs_prediction
.rename(columns={"median_IC80": "predicted IC80"})
# .query("aa_substitutions != ''")
[["antibody",
"aa_substitutions",
"measured IC80",
"MAP_fold_enrichment",
"predicted IC80",
"max % neutralization",
"times_seen",
"frac_models",
"n_models"]]
.merge(
sera_validation_vs_prediction
.rename(columns={"median_IC80": "predicted IC80"})
.query("aa_substitutions == ''")
[["antibody", "measured IC80", "predicted IC80"]],
on="antibody",
how="left",
validate="many_to_one",
suffixes=[" mutant", " unmutated"],
)
.assign(
measured_fold_change=lambda x: x["measured IC80 mutant"] / x["measured IC80 unmutated"],
predicted_fold_change=lambda x: x["predicted IC80 mutant"] / x["predicted IC80 unmutated"],
)
)
plot_data = fold_changes.query("frac_models>=.5").query('antibody!="PGT151"')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]
fold_change_chart = (
alt.Chart(plot_data.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title="measured fold change IC80",
scale=alt.Scale(type="log", nice=False),
),
y=alt.Y(
"predicted_fold_change",
title="predicted fold change IC80",
scale=alt.Scale(type="log", nice=False),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(scheme="tableau10")),
tooltip=[
alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
else c
for c in plot_data.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
for antibody in plot_data['antibody'].unique():
antibody_df = fold_changes.query("frac_models>=.5").query("antibody==@antibody")
antibody_df = antibody_df[~antibody_df['aa_substitutions'].str.contains(" ")]
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["predicted_fold_change"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"Predicted fold change correlation (R^2): {round(r_value**2,3)}")
fold_change_chart
1-18:
Predicted fold change correlation (R^2): 0.184
3BNC117:
Predicted fold change correlation (R^2): 0.614
IDC508:
Predicted fold change correlation (R^2): 0.587
IDC513:
Predicted fold change correlation (R^2): 0.671
IDC561:
Predicted fold change correlation (R^2): 0.257
IDF033:
Predicted fold change correlation (R^2): 0.82
[6]:
And now repeat these calculation and plots, this time including multi-mutants:
[7]:
print("All mutant correlations between DMS predicted and neutralization assay measured IC80s:")
for antibody, antibody_df in sera_validation_vs_prediction.groupby('antibody'):
antibody_df = antibody_df.query("frac_models>=.5").query("aa_substitutions!=''")
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["median_IC80"].astype(float),
antibody_df["measured IC80"].astype(float))
print(round(r_value**2,3))
if antibody == "PGT151":
print("PGT151 MAP_fold_enrichment correlation:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["MAP_fold_enrichment"].astype(float),
antibody_df["measured IC50"].astype(float))
print(round(r_value**2,3))
All mutant correlations between DMS predicted and neutralization assay measured IC80s:
1-18:
0.074
3BNC117:
0.635
IDC508:
0.813
IDC513:
0.377
IDC561:
0.037
IDF033:
0.76
[8]:
plot_data = sera_validation_vs_prediction.query("frac_models>=.5").query('antibody!="PGT151"')
corr_chart = (
alt.Chart(plot_data)
.encode(
x=alt.X("measured IC80", scale=alt.Scale(type="log", nice=False)),
y=alt.Y(
"median_IC80",
title="predicted IC80 from DMS",
scale=alt.Scale(type="log", nice=False),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(scheme="tableau20")),
tooltip=[
alt.Tooltip(c, format=".3g") if validation_vs_prediction[c].dtype == float
else c
for c in validation_vs_prediction.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
corr_chart
[8]:
[9]:
plot_data = fold_changes.query("frac_models>=.5").query('antibody!="PGT151"')
fold_change_chart = (
alt.Chart(plot_data.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title="measured fold change IC80",
scale=alt.Scale(type="log", nice=False),
),
y=alt.Y(
"predicted_fold_change",
title="predicted fold change IC80",
scale=alt.Scale(type="log", nice=False),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(scheme="tableau20")),
tooltip=[
alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
else c
for c in plot_data.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
for antibody in plot_data['antibody'].unique():
antibody_df = fold_changes.query("frac_models>=.5").query("antibody==@antibody")
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["predicted_fold_change"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"Predicted fold change correlation (R^2): {round(r_value**2,3)}")
fold_change_chart
1-18:
Predicted fold change correlation (R^2): 0.066
3BNC117:
Predicted fold change correlation (R^2): 0.634
IDC508:
Predicted fold change correlation (R^2): 0.82
IDC513:
Predicted fold change correlation (R^2): 0.371
IDC561:
Predicted fold change correlation (R^2): 0.006
IDF033:
Predicted fold change correlation (R^2): 0.766
[9]:
The following cells produce figures for the paper:
[10]:
# Figure for paper
PGT151_fold_changes = (
validation_vs_prediction.query('antibody=="PGT151"')
.rename(columns={"median_IC50": "predicted IC50"})
.query("aa_substitutions != ''")
[["antibody",
"aa_substitutions",
"measured IC50",
"MAP_fold_enrichment",
"predicted IC50",
"max % neutralization",
"times_seen",
"frac_models",
"n_models"]]
.merge(
validation_vs_prediction.query('antibody=="PGT151"')
.rename(columns={"median_IC50": "predicted IC50"})
.query("aa_substitutions == ''")
[["antibody", "measured IC50", "predicted IC50"]],
on="antibody",
how="left",
validate="many_to_one",
suffixes=[" mutant", " unmutated"],
)
.assign(
measured_fold_change=lambda x: x["measured IC50 mutant"] / x["measured IC50 unmutated"],
predicted_fold_change=lambda x: x["predicted IC50 mutant"] / x["predicted IC50 unmutated"],
)
)
antibody_df = PGT151_fold_changes.query("frac_models>=.5").query("antibody=='PGT151'")
antibody_df["MAP_fold_enrichment"] = antibody_df["MAP_fold_enrichment"].astype(float)
print(f"PGT151")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["MAP_fold_enrichment"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"MAP fold change correlation: {round(r_value**2,3)}")
corr_chart = (
alt.Chart(antibody_df.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title="measured fold change IC50",
scale=alt.Scale(type="log", domain=(.4, 20)),
),
y=alt.Y(
"MAP_fold_enrichment",
scale=alt.Scale(type="log", domain=(.4, 50)),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("max % neutralization",
title="max % neutralization",
scale=alt.Scale(scheme='yelloworangered')), tooltip=[
alt.Tooltip(c, format=".3g") if antibody_df[c].dtype == float
else c
for c in antibody_df.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=0.6)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
corr_chart
PGT151
MAP fold change correlation: 0.303
[10]:
[11]:
# Figure for paper
antibody_df = PGT151_fold_changes.query("frac_models>=.5").query("antibody=='PGT151'").query("aa_substitutions!=''")
antibody_df["MAP_fold_enrichment"] = antibody_df["MAP_fold_enrichment"].astype(float)
print("PGT151")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["predicted_fold_change"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"MAP fold change correlation: {round(r_value**2,3)}")
corr_chart = (
alt.Chart(antibody_df.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title="measured fold change IC50",
scale=alt.Scale(type="log", domain=(.4, 20)),
),
y=alt.Y(
"predicted_fold_change",
scale=alt.Scale(type="log", domain=(.4, 50)),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("max % neutralization",
title="max % neutralization",
scale=alt.Scale(scheme='yelloworangered')), tooltip=[
alt.Tooltip(c, format=".3g") if antibody_df[c].dtype == float
else c
for c in antibody_df.columns.tolist()
],
)
.mark_circle(filled=True, size=60, opacity=0.6)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=150, height=150)
)
corr_chart
PGT151
MAP fold change correlation: 0.813
[11]:
[12]:
# figure for paper
figure_palette = ['#999999', '#0072B2', '#E69F00', '#F0E442', '#009E73','#56B4E9', "#D55E00", "#CC79A7", '#9F0162']
plot_data = fold_changes.query("frac_models>=.5").query('antibody not in ["PGT151", "1-18", "3BNC117"]')
plot_data = plot_data[~plot_data['aa_substitutions'].str.contains(" ")]
for antibody in plot_data['antibody'].unique():
antibody_df = plot_data.query("frac_models>=.5").query("antibody==@antibody")
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["predicted_fold_change"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"Predicted fold change correlation (R): {round(r_value,3)}")
fold_change_chart = (
alt.Chart(plot_data.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title=None,
scale=alt.Scale(type="log", nice=False),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"predicted_fold_change",
title=None,
scale=alt.Scale(type="log", nice=False),
axis=alt.Axis(tickCount=4),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(range=figure_palette)),
tooltip=[
alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
else c
for c in plot_data.columns.tolist()
],
)
.mark_circle(filled=True, size=100, opacity=1, stroke='black', strokeWidth=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=125, height=125)
)
fold_change_chart
IDC508:
Predicted fold change correlation (R): 0.766
IDC513:
Predicted fold change correlation (R): 0.819
IDC561:
Predicted fold change correlation (R): 0.507
IDF033:
Predicted fold change correlation (R): 0.906
[12]:
[13]:
plot_data = fold_changes.query("frac_models>=.5").query('antibody not in ["PGT151", "1-18", "3BNC117"]')
virus_list = ["",
"T198D",
"N276D",
"G459D",
"T198D N276D",
"T198D G459D",
"N276D G459D",
"T198D N276D G459D"]
plot_data = plot_data.query('aa_substitutions in @virus_list')
figure_palette = ['#999999', '#0072B2', '#F0E442', '#E69F00', "#D55E00", '#009E73', '#56B4E9', "#CC79A7", '#9F0162']
for antibody in plot_data['antibody'].unique():
antibody_df = plot_data.query("frac_models>=.5").query("antibody==@antibody")
print(f"{antibody}:")
slope, intercept, r_value, p_value, std_err = stats.linregress(
antibody_df["predicted_fold_change"].astype(float),
antibody_df["measured_fold_change"].astype(float))
print(f"Predicted fold change correlation (R): {round(r_value,3)}")
fold_change_chart = (
alt.Chart(plot_data.query("frac_models>=.5"))
.encode(
x=alt.X(
"measured_fold_change",
title=None,
scale=alt.Scale(type="log", nice=False),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"predicted_fold_change",
title=None,
scale=alt.Scale(type="log", nice=False),
axis=alt.Axis(tickCount=5),
),
facet=alt.Facet("antibody", columns=4, title=None),
color=alt.Color("aa_substitutions",
title="Amino acid substitutions",
scale=alt.Scale(range=figure_palette)),
tooltip=[
alt.Tooltip(c, format=".3g") if plot_data[c].dtype == float
else c
for c in plot_data.columns.tolist()
],
)
.mark_circle(filled=True, size=100, opacity=1, stroke='black', strokeWidth=1)
.configure_axis(grid=False)
.resolve_scale(y="independent", x="independent")
.properties(width=125, height=125)
)
fold_change_chart
IDC508:
Predicted fold change correlation (R): 0.931
IDC513:
Predicted fold change correlation (R): 0.706
IDC561:
Predicted fold change correlation (R): 0.227
IDF033:
Predicted fold change correlation (R): 0.858
[13]:
[ ]:
[ ]: