Source code for af_analysis.sequence

from string import ascii_lowercase
import numpy as np
import logging

# Logging
logger = logging.getLogger(__name__)


[docs] def convert_aa_msa(seqs): """ Convert amino acid sequences to numbers. """ convert_dict = { "A": 0, "R": 1, "N": 2, "D": 3, "C": 4, "Q": 5, "E": 6, "G": 7, "H": 8, "I": 9, "L": 10, "K": 11, "M": 12, "F": 13, "P": 14, "S": 15, "T": 16, "W": 17, "V": 19, "Y": 18, "X": 20, "-": 21, } seqnums = [] for seq in seqs: seqnums.append([convert_dict[letter] for letter in seq]) return np.array(seqnums)
[docs] def parse_a3m(a3m_lines=None, a3m_file=None, filter_qid=0.15, filter_cov=0.5, N=100000): """ Parses an A3M file or list of A3M lines and filters sequences based on sequence identity and coverage. Parameters ---------- a3m_lines: list of str, optional List of lines from an A3M file. Default is None. a3m_file: str, optional Path to an A3M file. Default is None. filter_qid: float, optional Minimum sequence identity threshold for filtering. Default is 0.15. filter_cov: float, optional Minimum coverage threshold for filtering. Default is 0.5. N: int, optional Maximum number of sequences to return. Default is 100000. Returns ------- tuple: A tuple containing: - seqs (list of str): List of filtered sequences. - mtx (list of list of int): List of deletion matrices corresponding to the sequences. - nams (list of str): List of sequence names. """ def seqid(a, b): return sum(c1 == c2 for c1, c2 in zip(a, b)) def nongaps(a): return sum(c != "-" for c in a) def chk(seq, ref_seq): rL = len(ref_seq) L = nongaps(seq) return not (L > filter_cov * rL and seqid(seq, ref_seq) > filter_qid * L) rm_lower = str.maketrans("", "", ascii_lowercase) if a3m_file is None and a3m_lines is None: raise ValueError("Either a3m_file or a3m_lines must be provided.") # prep inputs if a3m_lines is None: a3m_lines = open(a3m_file, "r") # else: a3m_lines = a3m_lines.splitlines() # parse inputs n, nams, seqs, mtx = 0, [], [], [] def do_filter(): seq = seqs[-1].translate(rm_lower) if "_UPI" in nams[-1] or chk(seq, ref_seq): nams.pop() seqs.pop() else: # deletion matrix deletion_vec = [] deletion_count = 0 for j in seqs[-1]: if j.islower(): deletion_count += 1 else: deletion_vec.append(deletion_count) deletion_count = 0 mtx.append(deletion_vec) seqs[-1] = seq for line in a3m_lines: line = line.rstrip() if line.startswith(">"): if n == 1: ref_seq = seqs[0].translate(rm_lower) if n >= 1: # filter previous entry do_filter() # start new sequence entry nam = line.split()[0][1:] if "_" not in nam: nam = f"X_{nam}" nams.append(nam) seqs.append("") n += 1 else: seqs[-1] += line logger.info(f"- Reading {n:6} sequences.") # filter last entry do_filter() if len(seqs) > N + 1: logger.info( f"found too many sequences ({len(seqs)}), taking the top{N} (sorted by qid)" ) sid = np.argsort([seqid(seq, ref_seq) for seq in seqs])[::-1][: N + 1] seqs = [seqs[i] for i in sid] mtx = [mtx[i] for i in sid] nams = [nams[i] for i in sid] # return seqs[1:], mtx[1:], nams[1:] return seqs, mtx, nams