Mutation effects compared to natural sequence variation

This notebook compares the effects of mutations on Env's function of mediating entry into cells to their frequency in a natural sequence alignment. The alignments used are the filtered web alignments from LANL of both DNA and amino acid HIV sequences. These alignments were used to determine the natural sequence variation of HIV Env relative to BF520 in the library design section of the analysis. They were downloaded on November 9th, 2020 from the curated alignments at LANL with the following settings:
- Alignment type: Filtered web
- Organism: HIV-1/SIVcpz
- Region: Env
- Subtype: M group without recombinants (A-K)
- DNA/Protein: DNA for DNA alignment, Protein for amino acid alignment
- Year: 2018
- Format: FASTA

Import python modules:

[1]:
import warnings

import altair as alt
import numpy
import pandas as pd
import scipy
import scipy.stats
import yaml

warnings.simplefilter('ignore')

Load the configuration file:

[2]:
# Parse the analysis configuration; later cells look up input CSV paths in it.
# The encoding is pinned so the file is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open("config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

Import, filter, and prepare the data:

[3]:
# Natural-alignment mutation table: keep only rows from the "full" alignment
# and drop rows where the mutant equals the wildtype. The site label is cast to
# str so that it compares equal to the (also str) site labels of the tables it
# is merged with below.
natural_effects_full = (
    pd.read_csv(config['natural_sequence_data'])
    .query('alignment=="full"')
    .query('wildtype!=mutant')
    .assign(site=lambda df: df['site'].astype(str))
)
dms_effects_observed = pd.read_csv(config['muteffects_observed'])
natural_mutation_counts_full = pd.read_csv(config['natural_sequence_counts'])

# Expose the DMS columns under the names used for merging and plotting.
dms_effects_observed['site'] = dms_effects_observed['reference_site'].astype(str)
dms_effects_observed['observed_effect'] = dms_effects_observed['effect']

natural_mutation_counts_full = natural_mutation_counts_full.rename(columns={'reference_site': 'site'})
natural_mutation_counts_full['site'] = natural_mutation_counts_full['site'].astype(str)

# Inner merge: only mutations present in both the natural table and the DMS
# measurements are kept.
natural_effects_full = natural_effects_full.merge(
    dms_effects_observed[['site', 'mutant', 'observed_effect', 'times_seen']],
    on=['site', 'mutant'],
)

# Left merge: rows without an entry in the counts table are kept and get
# missing values in the columns coming from that table.
natural_effects_full = natural_effects_full.merge(
    natural_mutation_counts_full,
    on=['site', 'mutant'],
    how='left',
)

# Keep mutations with times_seen above 3. The explicit copy makes the result an
# independent frame, so columns can be added to it later without writing into a
# slice of natural_effects_full.
natural_effects_full_filtered = natural_effects_full.query('times_seen>3').copy()

Assign mutations to different groups based on their frequency in the natural sequence alignment:

[4]:
# Work on an explicit copy so the column assignments below never write into a
# slice of another frame.
natural_effects_full_filtered = natural_effects_full_filtered.copy()

# Bin mutations by their count in the natural alignment. Intervals are open on
# the left and closed on the right, e.g. "1-3" holds counts in (1, 3]. Counts
# of 1 or less, and missing counts, fall outside every bin and stay unassigned.
# The result is converted to object dtype so the column holds plain strings
# (plus missing values) and can take the extra 'stop codons' label below.
natural_effects_full_filtered['cutoff'] = pd.cut(
    natural_effects_full_filtered['count'],
    bins=[1, 3, 5, 10, 50, 100, float('inf')],
    labels=['1-3', '3-5', '5-10', '10-50', '50-100', '>100'],
).astype(object)

# Stop codons form their own group regardless of their natural count.
natural_effects_full_filtered.loc[natural_effects_full_filtered['mutant'] == '*', 'cutoff'] = 'stop codons'

# Drop mutations that were not assigned to any group and make sure the group
# label column contains strings only.
natural_effects_full_filtered = (
    natural_effects_full_filtered
    .dropna(subset=['cutoff'])
    .assign(cutoff=lambda df: df['cutoff'].astype(str))
)

Plot the data:

[5]:
score = 'observed_effect'

# Evaluate every group's kernel density estimate on one shared grid spanning
# the full range of scores, so all ridgelines share the same x positions.
# Rows without a frequency group were already removed when the groups were
# assigned, so no further filtering on 'cutoff' is needed here.
bins = numpy.linspace(
    natural_effects_full_filtered[score].min(),
    natural_effects_full_filtered[score].max(),
    num=50,
)
smoothed_dist = pd.concat(
    [
        pd.DataFrame(
            {
                "cutoff": cutoff,
                score: bins,
                # density of this group's scores evaluated at each grid point
                "count": scipy.stats.gaussian_kde(df[score])(bins),
                "mean_score": df[score].mean(),
            }
        )
        # Group by the column name rather than a one-element list so each key
        # is the plain label string and is broadcast over all grid points.
        for cutoff, df in natural_effects_full_filtered.groupby("cutoff")
    ]
)

# ridgeline plot: https://altair-viz.github.io/gallery/ridgeline_plot.html
facet_height = 27
facet_overlap = 0.7
ridgeline_chart = (
    alt.Chart(smoothed_dist)
    .encode(
        x=alt.X(score, title="score", scale=alt.Scale(nice=False)),
        y=alt.Y(
            "count",
            axis=None,
            # the negative upper end of the range lets each density extend
            # above its own facet and overlap the row above it
            scale=alt.Scale(range=[facet_height, -facet_overlap * facet_height]),
        ),
        fill=alt.Fill(
            "mean_score:Q",
            title="mean score",
            legend=alt.Legend(direction="horizontal"),
            scale=alt.Scale(scheme="yellowgreenblue"),
        ),
        row=alt.Row(
            "cutoff",
            title=None,
            header=alt.Header(labelAlign="left", labelAngle=0),
            spacing=0,
            # Row order from stop codons to the most frequent natural
            # mutations; the labels must match the group labels stored in
            # the 'cutoff' column (the top group is labelled ">100").
            sort=[
                "stop codons",
                "1-3",
                "3-5",
                "5-10",
                "10-50",
                "50-100",
                ">100",
            ],
        ),
        tooltip=[
            "cutoff",
            alt.Tooltip("mean_score", format=".2f", title="mean score"),
        ],
    )
    .mark_area(
        interpolate="monotone",
        smooth=True,
        fillOpacity=0.8,
        stroke="lightgray",
        strokeWidth=0.5,
    )
    .configure_view(stroke=None)
    .configure_axis(grid=False)
    .properties(width=200, height=facet_height, bounds="flush")
)

ridgeline_chart
[5]:
[ ]: