Source code for pbxplore.analysis.compare
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, absolute_import
# Third-party module
import numpy
# Local module
from . import utils
from ..io import fasta
def matrix_to_single_digit(matrix):
"""
Convert a similarity substitution matrix to a single digit distance
substitution matrix
This function takes a substitution matrix expressed as similarity scores,
and expresses it as integer distances in the range [0; 9]. Where 0 means
similar PBs, and 9 means different PBs.
Such single digit substitution matrix is convenient to display sequences
of substitution scores.
..note:
If a substitution matrix expressed with distances is provided
rather than a matrix expressed with similarity scores, then the
returned matrix is expressed with similarity scores.
"""
mini = numpy.min(matrix)
maxi = numpy.max(matrix)
# Normalize between 0 and 1
mat_modified = (matrix - mini)/(maxi - mini)
# Convert similarity scores to distances and change the range to [0; 9]
mat_modified = 9 * (1 - mat_modified)
# Convert to integers
mat_modified = mat_modified.astype(int)
# Set diagonal to 0
numpy.fill_diagonal(mat_modified, 0)
return mat_modified
def compare_to_first_sequence(headers, sequences, substitution_mat):
"""
Compare all sequence to the first one
"""
iheaders = iter(headers)
isequences = iter(sequences)
ref_name = next(iheaders)
ref_seq = next(isequences)
for target_header, target_seq in zip(iheaders, isequences):
header = "%s vs %s" % (ref_name, target_header)
score_lst = utils.compute_score_by_position(substitution_mat, ref_seq, target_seq)
yield header, score_lst
[docs]def compare(header_lst, seq_lst, substitution_mat, fname):
"""
Command line wrapper for the comparison of all sequences with the first one
When the --compare option is given to the command line, the program
compares all the sequences to the first one and writes these comparison as
sequences of digits. These digits represent the distance between the PB
in the target and the one in the reference at the same position. The digits
are normalized in the [0; 9] range.
This function run the comparison, write the result in a fasta file, and
display on screen informations about the process.
Parameters
----------
header_lst: list of strings
The list of sequence headers ordered as the sequences
seq_lst: list of strings
The list of sequences ordered as the headers
substitution_mat: numpy.array
A substitution matrix expressed as similarity scores
fname: str
The output file name
"""
ref_name = header_lst[0]
substitution_mat_modified = matrix_to_single_digit(substitution_mat)
print("Normalized substitution matrix (between 0 and 9)")
print(substitution_mat_modified)
print("Compare first sequence ({0}) with others".format(ref_name))
with open(fname, 'w') as outfile:
for header, score_lst in compare_to_first_sequence(header_lst, seq_lst,
substitution_mat_modified):
seq = "".join([str(s) for s in score_lst])
fasta._write_fasta_entry(outfile, seq, header)
print("wrote {0}".format(fname))