Source code for alhambra._tilesets_dx

"""DX Tile TileSet routines."""

from __future__ import annotations

from collections import Counter
from datetime import datetime, timezone
import logging
from typing import (
    Optional,
    TYPE_CHECKING,
    Any,
    Callable,
    Iterable,
    Literal,
    Sequence,
    cast,
)
from random import shuffle
import stickydesign.multimodel as multimodel
import numpy as np
import stickydesign

[docs]SELOGGER = logging.getLogger(__name__)
from stickydesign.energetics_daoe import EnergeticsDAOE from .glues import DXGlue, GlueList from .util import ( # DEFAULT_ENERGETICS,; DEFAULT_SD2_MULTIMODEL_ENERGETICS, DEFAULT_MM_ENERGETICS_NAMES, DEFAULT_MULTIMODEL_ENERGETICS, DEFAULT_REGION_ENERGETICS, DEFAULT_ENERGETICS, ) from . import seq, util if TYPE_CHECKING: from .tilesets import TileSet
[docs]def stickydesign_create_dx_glue_sequences( self, method: Literal["default", "multimodel"] = "default", energetics: "stickydesign.Energetics" | None = None, trials: int = 100, devmethod="dev", sdopts: dict[str, Any] | None = None, ecpars: dict[str, Any] | None = None, listends: bool = False, ) -> tuple[TileSet, Sequence[str]]: """Create sticky end sequences for the TileSet, using stickydesign, and returning a new TileSet including the ends. Parameters ---------- method: {'default', 'multimodel'} if 'default', use the default, single-model sequence design. If 'multimodel', use multimodel end choice. energetics : stickydesign.Energetics the energetics instance to use for the design, or list of energetics for method='multimodel', in which case the first will be the primary. If None (default), will use alhambra.DEFAULT_ENERGETICS, or, if method='multimodel', will use alhambra.DEFAULT_MM_ENERGETICS. trials : int the number of trials to attempt. FIXME sdopts : dict a dictionary of parameters to pass to stickydesign.easy_ends. ecpars : dict a dictionary of parameters to pass to the endchooser function generator (useful only in limited circumstances). listends : bool if False, return just the TileSet. If True, return both the TileSet and a list of the names of the ends that were created. Returns ------- tileset : TileSet TileSet with designed end sequences included. new_ends : list Names of the ends that were designed. """ if sdopts is None: sdopts = {} if ecpars is None: ecpars = {} info: dict[str, Any] = {} info["method"] = method info["time"] = datetime.now(tz=timezone.utc).isoformat() info["sd_version"] = stickydesign.version.__version__ if not energetics: if method == "multimodel": all_energetics = DEFAULT_MULTIMODEL_ENERGETICS else: energetics = DEFAULT_ENERGETICS if method == "multimodel" and not isinstance(energetics, Iterable): raise ValueError("Energetics must be an iterable for multimodel.") elif method == "multimodel": all_energetics = cast(list[EnergeticsDAOE], energetics) energetics = all_energetics[0] info["energetics"] = [str(e) for e in all_energetics] info["trails"] = trials elif method == "default": info["energetics"] = str(energetics) energetics = cast(EnergeticsDAOE, energetics) else: raise ValueError # Steps for doing this: # Build a list of ends from the endlist in the tileset. Do this # by creating a NamedList, then merging them into it. ends: GlueList[DXGlue] = GlueList() ends.update(g for g in self.glues if isinstance(g, DXGlue)) ends.update(g for g in self.tiles.glues_from_tiles() if isinstance(g, DXGlue)) # Ensure that if there are any resulting completely-undefined ends, they # have their sequences removed. # for end in ends: # if end.fseq and set(end.fseq) == {'n'}: # del(end.fseq) # Build inputs suitable for stickydesign: lists of old sequences for TD/DT, # and numbers of new sequences needed. oldDTseqs = [ end.fseq for end in ends if end.etype == "DT" and seq.is_definite(end.fseq) ] if oldDTseqs: oldDTarray = stickydesign.endarray(oldDTseqs, "DT") else: oldDTarray = None oldTDseqs = [ end.fseq for end in ends if end.etype == "TD" and seq.is_definite(end.fseq) ] if oldTDseqs: oldTDarray = stickydesign.endarray(oldTDseqs, "TD") else: oldTDarray = None newTD = [end for end in ends if end.etype == "TD" and not seq.is_definite(end.fseq)] newDT = [end for end in ends if end.etype == "DT" and not seq.is_definite(end.fseq)] # Deal with energetics, considering potential old sequences. # FIXME: EXPLAIN WHAT THIS ABSTRUSE CODE DOES... # TODO: tests needs to test this targets = [] if len(oldDTseqs) == 0 and len(oldTDseqs) == 0: targets.append( stickydesign.enhist("DT", 5, energetics=energetics)[2]["emedian"] ) targets.append( stickydesign.enhist("TD", 5, energetics=energetics)[2]["emedian"] ) if oldDTarray: targets.append(energetics.matching_uniform(oldDTarray)) if oldTDarray: targets.append(energetics.matching_uniform(oldTDarray)) targetint = np.average(targets) if any(not seq.is_null(end.fseq) for end in newTD): TDtemplates = [end.fseq for end in newTD] else: TDtemplates = [] if any(not seq.is_null(end.fseq) for end in newDT): DTtemplates = [end.fseq for end in newDT] else: DTtemplates = [] if method == "default": if TDtemplates or DTtemplates: raise NotImplementedError # Create new sequences. newTDseqs = stickydesign.easyends( "TD", 5, number=len(newTD), energetics=energetics, interaction=targetint, **sdopts, ).tolist() newDTseqs = stickydesign.easyends( "DT", 5, number=len(newDT), energetics=energetics, interaction=targetint, **sdopts, ).tolist() elif method == "multimodel": SELOGGER.info( "starting multimodel sticky end generation " + "of TD ends for {} DT and {} TD ends, {} trials.".format( len(newDT), len(newTD), trials ) ) newTDseqs = [] pl = util.ProgressLogger(SELOGGER, trials * 2) presetavail = None for i in range(0, trials): endchooserTD = multimodel.endchooser( all_energetics, templates=TDtemplates, devmethod=devmethod, **ecpars ) e, presetavail = stickydesign.easyends( "TD", 5, number=len(newTD), oldends=oldTDseqs, energetics=energetics, interaction=targetint, echoose=endchooserTD, _presetavail=presetavail, **sdopts, ) newTDseqs.append(e) pl.update(i) if oldTDarray: tvals = [ [e.matching_uniform(oldTDarray[0:1]) for e in all_energetics] * len(newTDseqs) ] * len(newTDseqs) SELOGGER.debug(tvals[0]) else: tvals = [ [e.matching_uniform(x[0:1]) for e in all_energetics] for x in newTDseqs ] endchoosersDT = [ multimodel.endchooser( all_energetics, target_vals=tval, templates=DTtemplates, devmethod=devmethod, **ecpars, ) for tval in tvals ] SELOGGER.info("generating corresponding DT ends") newDTseqs = [] presetavail = None for i, echoose in enumerate(endchoosersDT): e, presetavail = stickydesign.easyends( "DT", 5, number=len(newDT), oldends=oldDTseqs, energetics=energetics, interaction=targetint, echoose=echoose, _presetavail=presetavail, **sdopts, ) newDTseqs.append(e) pl.update(i + trials) arr = [ [ stickydesign.endarray(oldTDseqs + x.tolist(), "TD"), stickydesign.endarray(oldDTseqs + y.tolist(), "DT"), ] for x, y in zip(newTDseqs, newDTseqs) ] scores = [ multimodel.deviation_score(list(e), all_energetics, devmethod=devmethod) for e in arr ] sort = np.argsort(scores) newTDseqs = newTDseqs[sort[0]].tolist()[len(oldTDseqs) :] newDTseqs = newDTseqs[sort[0]].tolist()[len(oldDTseqs) :] info["score"] = float(scores[sort[0]]) info["maxscore"] = float(scores[sort[-1]]) info["meanscore"] = float(np.mean(scores)) # FIXME: move to stickydesign assert len(newTDseqs) == len(newTD) assert len(newDTseqs) == len(newDT) # Shuffle the lists of end sequences, to ensure that they're # random order, and that ends used earlier in the set are not # always better than those used later. But only shuffle if # there were no templates: if not TDtemplates: shuffle(newTDseqs) if not DTtemplates: shuffle(newDTseqs) # Make sure things are consistent if there are templates: if TDtemplates: for t, s in zip(TDtemplates, newTDseqs): seq.merge(t, s) if DTtemplates: for t, s in zip(DTtemplates, newDTseqs): seq.merge(t, s) for end, s in zip(newDT, newDTseqs): ends[end.ident()].fseq = s for end, s in zip(newTD, newTDseqs): ends[end.ident()].fseq = s # Ensure that the old and new sets have consistent end definitions, # and that the tile definitions still fit. self.glues.update(ends) # self.tiles.glues_from_tiles().update(ends) newendnames = [e.name for e in newTD] + [e.name for e in newDT] info["newends"] = newendnames # Apply new sequences to tile system. self.ends = ends # if "info" not in self.keys(): # self["info"] = {} # if "end_design" not in self["info"].keys(): # self["info"]["end_design"] = [] # if isinstance("end_design", dict): # convert old # self["info"]["end_design"] = [self["info"]["end_design"]] # self["info"]["end_design"].append(info) return self, newendnames
[docs]def dx_plot_se_hists( self: TileSet, all_energetics: Sequence[stickydesign.Energetics] | None = None, energetics_names: Sequence[str] | None = None, title: str | None = None, **kwargs: Any, ) -> Any: """Plot histograms of sticky end energies, using stickydesign.plots.hist_multi. Parameters ---------- all_energetics : list of Energetics A list of energetics to use. Defaults to DEFAULT_MULTIMODEL_ENERGETICS. energetics_names : list of str Names for energetics in all_energetics. Defaults to DEFAULT_MM_ENERGETICS_NAMES. title : str Title for the plot. **kwargs kwargs passed to stickydesign.plots.hist_multi. """ if all_energetics is None: all_energetics = DEFAULT_MULTIMODEL_ENERGETICS if energetics_names is None: energetics_names = DEFAULT_MM_ENERGETICS_NAMES ends = self.glues if title is None: # FIXME title = "Title" td = stickydesign.endarray( [x.fseq for x in ends if isinstance(x, DXGlue) and x.etype == "TD"], "TD" ) dt = stickydesign.endarray( [x.fseq for x in ends if isinstance(x, DXGlue) and x.etype == "DT"], "DT" ) import stickydesign.plots as sdplots return sdplots.hist_multi( [td, dt], all_energetics, energetics_names, title, **kwargs )
[docs]def dx_plot_se_lv( self: TileSet, all_energetics: Sequence[stickydesign.Energetics] | None = None, energetics_names: Sequence[str] | None = None, pltcmd: Optional[Callable[[Any], Any]] = None, title: str | None = None, **kwargs, ): """ Uses an LV plot to show sticky end energetics. """ if all_energetics is None: all_energetics = DEFAULT_MULTIMODEL_ENERGETICS if energetics_names is None: energetics_names = DEFAULT_MM_ENERGETICS_NAMES import stickydesign.plots as sdplots m, s = sdplots._multi_data_pandas( self.glues.to_endarrays(), all_energetics, energetics_names ) import matplotlib.pyplot as plt import seaborn as sns if pltcmd is None: pltcmd = sns.lvplot pltcmd(data=m, **kwargs) # type: ignore pltcmd(data=s, marker="x", **kwargs) # type: ignore if title: plt.title(title) plt.ylabel("Energy (kcal/mol)")
[docs]def dx_plot_adjacent_regions(self: TileSet, energetics=None): """ Plots the strength of double-stranded regions in DX tiles adjacent to sticky ends. Parameters ---------- energetics : stickydesign.Energetics The energetics to use. Defaults to DEFAULT_REGION_ENERGETICS. """ if energetics is None: energetics = DEFAULT_REGION_ENERGETICS regions = [t.structure._side_bound_regions(t) for t in self.tiles] regions = [[x.lower() for x in y] for y in regions] allregions: list[str] = sum(regions, []) count: list[list[Counter]] = [[Counter(x) for x in y] for y in regions] gc_count = [[x["g"] + x["c"] for x in c] for c in count] gc_counts: list[int] = sum(gc_count, []) ens = energetics.matching_uniform(stickydesign.endarray(allregions, "DT")) from matplotlib import pylab pylab.figure(figsize=(10, 4)) pylab.subplot(121) pylab.hist(gc_counts, bins=np.arange(min(gc_counts) - 0.5, max(gc_counts) + 0.5)) pylab.title("G/C pairs in arms") pylab.ylabel("# of 8 nt arms") pylab.xlabel("# of G/C pairs") pylab.subplot(122) pylab.hist(ens) pylab.title("ΔG, T=33, no coaxparams/danglecorr") pylab.ylabel("# of 8 nt regions") pylab.xlabel("stickydesign ΔG") pylab.suptitle("8 nt end-adjacent region strengths")
[docs]def dx_plot_side_strands(self: TileSet, energetics=None): """ Plots the binding strength of short strands in DX tiles. Parameters ---------- energetics : stickydesign.Energetics The energetics to use. Defaults to DEFAULT_REGION_ENERGETICS. """ if energetics is None: energetics = DEFAULT_REGION_ENERGETICS regions = [t.structure._short_bound_full(t) for t in self.tiles] regions = [[x.lower() for x in y] for y in regions] allregions: list[str] = sum(regions, []) count: list[list[Counter]] = [[Counter(x) for x in y] for y in regions] gc_count = [[x["g"] + x["c"] for x in c] for c in count] gc_counts: list[int] = sum(gc_count, []) ens = energetics.matching_uniform(stickydesign.endarray(allregions, "DT")) from matplotlib import pylab pylab.figure(figsize=(10, 4)) pylab.subplot(121) pylab.hist(gc_counts, bins=np.arange(min(gc_counts) - 0.5, max(gc_counts) + 0.5)) pylab.title("G/C pairs in arms") pylab.ylabel("# of 8 nt arms") pylab.xlabel("# of G/C pairs") pylab.subplot(122) pylab.hist(ens) pylab.title("ΔG, T=33, no coaxparams/danglecorr") pylab.ylabel("# of 16 nt regions") pylab.xlabel("stickydesign ΔG") pylab.suptitle("16 nt arm region strengths")