Probabilities of escape and neutralization in antibody selections

This notebook analyzes the probabilities of escape and neutralization in antibody selections.

Import Python modules:

[1]:
import os

import altair as alt

import dms_variants.codonvarianttable

import pandas as pd

import yaml
[2]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

[3]:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Read the escape probabilities and neutralization information

Get the antibody selections data frame:

[4]:
antibody_selections = pd.read_csv(
    config["antibody_selections"],
    dtype={"antibody": str},
)

Read in the probabilities of escape and neutralization information:

[5]:
selection_groups = antibody_selections["selection_group"].unique()

prob_escape = pd.concat(
    [
        pd.read_csv(
            os.path.join(
                config["prob_escape_dir"], f"{selection_group}_prob_escape.csv"
            ),
            keep_default_na=False,
            na_values="nan",
        )
        for selection_group in selection_groups
    ],
    ignore_index=True,
)

neut_standard_fracs = pd.concat(
    [
        pd.read_csv(
            os.path.join(
                config["prob_escape_dir"], f"{selection_group}_neut_standard_fracs.csv"
            ),
            na_filter=None,
            dtype={"antibody": str},
        )
        for selection_group in selection_groups
    ],
    ignore_index=True,
).merge(
    antibody_selections.drop(columns="antibody_concentration"),  # can be rounding
    how="left",
    validate="one_to_one",
)
assert neut_standard_fracs.notnull().all().all()

neutralization = pd.concat(
    [
        pd.read_csv(
            os.path.join(
                config["prob_escape_dir"], f"{selection_group}_neutralization.csv"
            ),
            na_filter=None,
        )
        for selection_group in selection_groups
    ],
    ignore_index=True,
)

# sample order for plotting
sample_order = antibody_selections.sort_values(
    [
        "library",
        "date",
        "virus_batch",
        "antibody",
        "replicate",
        "antibody_concentration",
    ],
)["antibody_library_sample"].tolist()

Fraction of barcode counts for neutralization standard

Make an interactive plot showing this fraction of counts that are neutralization standard for each antibody sample and its no-antibody control. Note the fractions are plotted on a symlog scale:

[6]:
# make tidy version of neut_standard_fracs
melt_cols = ["antibody_frac", "no-antibody_frac"]
neut_standard_fracs_tidy = neut_standard_fracs.melt(
    id_vars=[c for c in neut_standard_fracs.columns if c not in melt_cols],
    value_vars=melt_cols,
    value_name="neut_standard_frac",
    var_name="sample_type",
).assign(
    sample_type=lambda x: x["sample_type"].str.replace("_frac", ""),
    library_sample=lambda x: x["library"] + "_" + x["antibody_sample"],
)

# set up selections over other columns of interest
selection_names = ["library", "date", "antibody", "virus_batch"]
selections = [
    alt.selection_point(
        fields=[col],
        bind=alt.binding_select(
            options=[None] + neut_standard_fracs_tidy[col].unique().tolist(),
            labels=["all"] + [str(x) for x in neut_standard_fracs_tidy[col].unique()],
            name=col,
        ),
    )
    for col in selection_names
]

neut_standard_fracs_chart = (
    alt.Chart(neut_standard_fracs_tidy)
    .encode(
        x=alt.X(
            "neut_standard_frac",
            title="neutralization standard fraction",
            scale=alt.Scale(type="symlog", constant=0.02, domainMax=1),
        ),
        y=alt.Y("library_sample", title=None, sort=sample_order),
        color="sample_type",
        shape="sample_type",
        tooltip=[
            alt.Tooltip(c, format=".2g") if c == "neut_standard_frac" else c
            for c in neut_standard_fracs_tidy.columns
            if "library_sample" not in c
        ],
    )
    .mark_point()
    .properties(width=275, height=alt.Step(14))
    .add_params(*selections)
    .configure_axis(labelLimit=500)
)
for selection in selections:
    neut_standard_fracs_chart = neut_standard_fracs_chart.transform_filter(selection)

neut_standard_fracs_chart
[6]:

Overall fraction neutralization by number of mutations

Plot neutralization (1 - probability of escape) averaged over all variants with a given number of mutations) for just the primary target. First, get the neutralization data ready to plot by just getting data for primary target, melting, and doing a few other transformations:

[7]:
cols_to_melt = ["prob_escape", "prob_escape_uncensored"]
assert "target" not in neutralization.columns, "only implemented for primary target"
neut_to_plot = (
    neutralization.melt(
        id_vars=[c for c in neutralization.columns if c not in cols_to_melt],
        var_name="censored",
        value_name="fraction not neutralized",
    )
    .assign(
        library_sample=lambda x: x["library"] + "_" + x["antibody_sample"],
        censored=lambda x: (x["censored"] == "prob_escape").map(
            {True: "yes", False: "no"}
        ),
        n_aa_substitutions=lambda x: (
            x["n_aa_substitutions"].map(
                lambda n: f">{n - 1}" if n == x["n_aa_substitutions"].max() else str(n)
            )
        ),
    )
    .rename(
        columns={
            c: c.replace("_", " ")
            for c in neutralization.columns
            if c.endswith("count")
        }
    )
    .merge(antibody_selections, validate="many_to_one", how="left")
)

Now make the plot. You can mouseover points for details, use the dropdown to subset samples, click on the legend to show only specific number of amino-acid substitutions or only censored (to [0, 1]) values or raw values. You should get worried if the censored and uncensored values look dramatically different:

[8]:
censored_selection = alt.selection_point(
    fields=["censored"],
    bind="legend",
)

n_aa_substitutions_selection = alt.selection_point(
    fields=["n_aa_substitutions"],
    bind="legend",
)

neutralization_chart = (
    alt.Chart(neut_to_plot)
    .encode(
        x=alt.X("fraction not neutralized"),
        y=alt.Y("library_sample", title=None),
        color=alt.Color(
            "n_aa_substitutions:N",
            title="amino-acid mutations",
            scale=alt.Scale(domain=neut_to_plot["n_aa_substitutions"].unique()),
        ),
        shape=alt.Shape(
            "censored:N",
            title="censored between 0 & 1",
            scale=alt.Scale(
                domain=neut_to_plot["censored"].unique(),
                range=["triangle-up", "triangle-down"],
            ),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g")
            if c.endswith("_count")
            or c.startswith("fraction")
            or c == "antibody_concentration"
            else c
            for c in neut_to_plot.columns
            if "library_sample" not in c
        ],
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=300, height=alt.Step(14))
    .add_params(censored_selection, n_aa_substitutions_selection, *selections)
    .transform_filter(censored_selection)
    .transform_filter(n_aa_substitutions_selection)
    .configure_axis(labelLimit=500)
)
for selection in selections:
    neutralization_chart = neutralization_chart.transform_filter(selection)

neutralization_chart
[8]:

Variants with sufficient no-antibody counts

We only calculate the probability of escape for variants with some minimum threshold of no-antibody counts.

Draw a boxplot for the no-antibody samples with a black line at the median, boxes spanning the 25th to 75th percentiles, lines spanning the minimum to the maximum, and a red line indciating the threshold. You can mouseover bars for details and use the dropdown selections to just show certain subsets. Note that y-axis uses a symlog scale:

[9]:
# get data to plot
no_antibody_count_boxplot_df = (
    prob_escape[
        [
            "library",
            "no-antibody_sample",
            "no-antibody_count",
            "barcode",
            "no_antibody_count_threshold",
        ]
    ]
    .drop_duplicates()
    .assign(
        variant_above_threshold=lambda x: (
            x["no-antibody_count"] >= x["no_antibody_count_threshold"]
        ).astype(int),
        count_above_threshold=lambda x: x["no-antibody_count"]
        * x["variant_above_threshold"],
    )
    .groupby(
        ["library", "no-antibody_sample", "no_antibody_count_threshold"], as_index=False
    )
    .aggregate(
        median=pd.NamedAgg("no-antibody_count", "median"),
        percentile_25=pd.NamedAgg("no-antibody_count", lambda s: s.quantile(0.25)),
        percentile_75=pd.NamedAgg("no-antibody_count", lambda s: s.quantile(0.75)),
        min=pd.NamedAgg("no-antibody_count", "min"),
        max=pd.NamedAgg("no-antibody_count", "max"),
        nvariants=pd.NamedAgg("barcode", "count"),
        total_counts=pd.NamedAgg("no-antibody_count", "sum"),
        variants_above_threshold=pd.NamedAgg("variant_above_threshold", "sum"),
        counts_above_threshold=pd.NamedAgg("count_above_threshold", "sum"),
    )
    .assign(
        library_sample=lambda x: x["library"] + " " + x["no-antibody_sample"],
        frac_counts_above_threshold=lambda x: x["counts_above_threshold"]
        / x["total_counts"],
        frac_variants_above_threshold=lambda x: x["variants_above_threshold"]
        / x["nvariants"],
    )
    .drop(
        columns=[
            "counts_above_threshold",
            "variants_above_threshold",
            "total_counts",
            "nvariants",
        ]
    )
    .merge(
        antibody_selections[
            ["no-antibody_sample", "virus_batch", "date", "library"]
        ].drop_duplicates(),
        how="left",
        validate="one_to_one",
    )
)
assert (
    len(no_antibody_count_boxplot_df)
    == no_antibody_count_boxplot_df["library_sample"].nunique()
)

# make plot
no_antibody_count_base = alt.Chart(no_antibody_count_boxplot_df).encode(
    y=alt.Y("library_sample", title=None),
    tooltip=[
        alt.Tooltip(c, format=".2g")
        if no_antibody_count_boxplot_df[c].dtype == float
        else c
        for c in no_antibody_count_boxplot_df.columns
        if c != "library_sample"
    ],
)

no_antibody_count_quartile_bars = no_antibody_count_base.encode(
    alt.X(
        "percentile_25",
        scale=alt.Scale(type="symlog", constant=20),
        title="counts for variant",
    ),
    alt.X2("percentile_75"),
).mark_bar(color="blue")

no_antibody_count_range_lines = no_antibody_count_base.encode(
    alt.X("min"),
    alt.X2("max"),
).mark_rule(color="blue", opacity=0.5)

no_antibody_count_median_lines = no_antibody_count_base.encode(
    alt.X("median"), alt.X2("median")
).mark_bar(xOffset=1, x2Offset=-1, color="black")

no_antibody_count_threshold = no_antibody_count_base.encode(
    alt.X("no_antibody_count_threshold"), alt.X2("no_antibody_count_threshold")
).mark_bar(xOffset=1, x2Offset=-1, color="red")

no_antibody_count_chart = (
    (
        no_antibody_count_quartile_bars
        + no_antibody_count_range_lines
        + no_antibody_count_median_lines
        + no_antibody_count_threshold
    )
    .configure_axis(labelLimit=500)
    .properties(width=350, height=alt.Step(14))
)

for s, name in zip(selections, selection_names):
    if name != "antibody":
        no_antibody_count_chart = no_antibody_count_chart.add_params(
            s
        ).transform_filter(s)

no_antibody_count_chart
[9]:

Plot the fraction of all variants, and fraction of all counts, that are above the thresholds for the no-antibody conditions:

[10]:
frac_counts_df = no_antibody_count_boxplot_df.drop(
    columns=[
        "median",
        "no_antibody_count_threshold",
        "min",
        "max",
        "percentile_25",
        "percentile_75",
    ],
).rename(
    columns={
        "frac_counts_above_threshold": "counts",
        "frac_variants_above_threshold": "variants",
    }
)

fraction_type_selection = alt.selection_point(
    fields=["fraction_type"],
    bind="legend",
)

frac_counts_chart = (
    alt.Chart(frac_counts_df)
    .transform_fold(fold=["counts", "variants"], as_=["fraction_type", "fraction"])
    .encode(
        y=alt.Y("library_sample", title=None),
        tooltip=[
            alt.Tooltip(c, format=".2g") if frac_counts_df[c].dtype == float else c
            for c in frac_counts_df.columns
            if c != "library_sample"
        ],
        x=alt.X(
            "fraction:Q",
            title="fraction above threshold",
            scale=alt.Scale(domain=(0, 1)),
        ),
        color=alt.Color(
            "fraction_type:N",
            title=None,
            scale=alt.Scale(domain=["counts", "variants"]),
        ),
    )
    .mark_point(filled=True, size=50)
    .properties(width=200, height=alt.Step(14))
    .configure_axis(labelLimit=500)
    .add_params(fraction_type_selection)
    .transform_filter(fraction_type_selection)
)

for s, name in zip(selections, selection_names):
    if name != "antibody":
        frac_counts_chart = frac_counts_chart.add_params(s).transform_filter(s)

frac_counts_chart
[10]:

Now just get probability of escape measurements for variants that exceed no-antibody count threshold:

[11]:
prob_escape_filtered = prob_escape.query(
    "`no-antibody_count` >= no_antibody_count_threshold"
)

Distribution of probability escape across variants

Plot distribution of prob escape values across non-filtered variants. The boxes span 25th to 75th percentile, the black vertical line is at the median, and the thin lines extend from the min to max. You can show either the censored (to between 0 and 1) or uncensored probabilities of escape. Note the plot uses a symlog scale:

[12]:
# get data to plot
prob_escape_boxplot_df = (
    prob_escape_filtered.melt(
        id_vars=["library", "antibody_sample", "no-antibody_sample"],
        value_vars=["prob_escape", "prob_escape_uncensored"],
        var_name="censored",
        value_name="probability escape",
    )
    .groupby(
        ["library", "antibody_sample", "no-antibody_sample", "censored"],
        as_index=False,
    )
    .aggregate(
        median=pd.NamedAgg("probability escape", "median"),
        percentile_25=pd.NamedAgg("probability escape", lambda s: s.quantile(0.25)),
        percentile_75=pd.NamedAgg("probability escape", lambda s: s.quantile(0.75)),
        min=pd.NamedAgg("probability escape", "min"),
        max=pd.NamedAgg("probability escape", "max"),
    )
    .assign(censored=lambda x: (x["censored"] == "prob_escape"))
    .merge(
        antibody_selections[
            [
                "antibody_sample",
                "antibody_library_sample",
                "virus_batch",
                "date",
                "library",
                "antibody",
            ]
        ],
        how="left",
        on=["library", "antibody_sample"],
        validate="many_to_one",
    )
)
assert (
    len(prob_escape_boxplot_df)
    == 2 * prob_escape_boxplot_df["antibody_library_sample"].nunique()
)

# make plot
prob_escape_base = alt.Chart(prob_escape_boxplot_df).encode(
    y=alt.Y("antibody_library_sample", title=None, sort=sample_order),
    tooltip=[
        alt.Tooltip(c, format=".3g")
        if prob_escape_boxplot_df[c].dtype == "float"
        else c
        for c in prob_escape_boxplot_df.columns
        if c != "antibody_library_sample"
    ],
)

prob_escape_quartile_bars = prob_escape_base.encode(
    alt.X(
        "percentile_25",
        scale=alt.Scale(type="symlog", constant=0.25),
        title="probability escape",
    ),
    alt.X2("percentile_75"),
).mark_bar(color="blue")

prob_escape_range_lines = prob_escape_base.encode(
    alt.X("min"),
    alt.X2("max"),
).mark_rule(color="blue", opacity=0.5)

prob_escape_median_lines = prob_escape_base.encode(
    alt.X("median"), alt.X2("median")
).mark_bar(xOffset=1, x2Offset=-1, color="black")

prob_escape_chart = (
    (prob_escape_quartile_bars + prob_escape_range_lines + prob_escape_median_lines)
    .configure_axis(labelLimit=500)
    .properties(width=350, height=alt.Step(14))
)

censored_dropdown_selection = alt.selection_point(
    fields=["censored"],
    value=[{"censored": True}],
    bind=alt.binding_select(
        options=prob_escape_boxplot_df["censored"].unique().tolist(),
        name="censored",
    ),
)

for s in list(selections) + [censored_dropdown_selection]:
    prob_escape_chart = prob_escape_chart.add_params(s).transform_filter(s)

prob_escape_chart
[12]:

Correlations in variant-level escape probabilities

Analyze correlations of escape probabilities of different variants in same library:

[13]:
assert len(
    prob_escape_filtered.groupby(["library", "antibody_sample", "no-antibody_sample"])
) == len(prob_escape_filtered.groupby(["library", "antibody_sample"]))

prob_escape_tidy = (
    prob_escape_filtered.rename(
        columns={"prob_escape": "censored", "prob_escape_uncensored": "uncensored"}
    )
    .melt(
        id_vars=["antibody_sample", "barcode", "library"],
        value_vars=["censored", "uncensored"],
        value_name="prob_escape",
        var_name="censored",
    )
    .assign(
        censored=lambda x: x["censored"].map({"censored": True, "uncensored": False})
    )
)

corrs = (
    dms_variants.utils.tidy_to_corr(
        df=prob_escape_tidy,
        sample_col="antibody_sample",
        label_col="barcode",
        value_col="prob_escape",
        group_cols=["library", "censored"],
    )
    .assign(r2=lambda x: x["correlation"] ** 2)
    .drop(columns="correlation")
)

# add other properties
suffixes = ["_1", "_2"]
for suffix in suffixes:
    corrs = corrs.merge(
        antibody_selections,
        left_on=["library", f"antibody_sample{suffix}"],
        right_on=["library", "antibody_sample"],
        validate="many_to_one",
        suffixes=suffixes,
    ).drop(
        columns=[
            "antibody_library_sample",
            "no-antibody_library_sample",
            "no-antibody_sample",
            "antibody_sample",
            "replicate",
        ]
    )

# make columns identical for both samples drop suffix for selections
for col in antibody_selections.columns:
    if (
        f"{col}_1" in corrs.columns
        and f"{col}_2" in corrs.columns
        and col != "antibody_sample"
    ):
        equal = corrs[f"{col}_1"] == corrs[f"{col}_2"]
        corrs[col] = corrs[f"{col}_1"].where(equal, pd.NA)
        corrs = corrs.drop(columns=[f"{col}{suffix}" for suffix in suffixes])
/fh/fast/bloom_j/computational_notebooks/ceradfor/2023/HIV_Envelope_BF520_DMS_CD4bs_sera/.snakemake/conda/056c8fe5fcb5561ce39a818628b75df6_/lib/python3.11/site-packages/dms_variants/utils.py:360: FutureWarning: The default value of numeric_only in DataFrameGroupBy.corr is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
[14]:
for library, library_corr in corrs.groupby("library"):
    corr_chart = (
        alt.Chart(library_corr)
        .encode(
            alt.X("antibody_sample_1", title=None),
            alt.Y("antibody_sample_2", title=None),
            color=alt.Color("r2", scale=alt.Scale(zero=True)),
            tooltip=[
                alt.Tooltip(c, format=".3g") if c == "r2" else c
                for c in ["library", "antibody_sample_1", "antibody_sample_2", "r2"]
            ],
        )
        .mark_rect(stroke="black")
        .properties(width=alt.Step(15), height=alt.Step(15), title=library)
        .configure_axis(labelLimit=500)
    )
    for s, name in list(zip(selections, selection_names)) + [
        (censored_dropdown_selection, "censored")
    ]:
        if name != "library":
            corr_chart = corr_chart.add_params(s).transform_filter(s)
    display(corr_chart)