Source code for sasmol.utilities

# from __future__ import absolute_import

# from __future__ import unicode_literals
#
#    SASMOL: Copyright (C) 2011 Joseph E. Curtis, Ph.D.
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# UTIL
#
# 12/10/2009	--	initial coding				:	jc
# 11/24/2011	-- 	moved to seperate file      :	jc
# 12/27/2015	-- 	refactored for release      :   jc
# 08/19/2016	-- 	added doc strings           :   jc
#
# 1         2         3         4         5         6         7
# LC4567890123456789012345678901234567890123456789012345678901234567890123456789
# *      **
'''
	Util holds general methods for file naming, os differences,
        chemical formula parsing, etc.
'''
import sasmol.system as system
import re
import string
import copy
import numpy


[docs]def find_unique(this_list): return list(numpy.unique(this_list))
[docs]class Copy_Using_Mask(): #@classmethod #def from_sasmol(class_instance, mask, **kwargs):
[docs] @classmethod def from_sasmol(cls, class_instance, mask, **kwargs): ''' Parameters __________ class_instance system object mask integer list kwargs optional future keyword arguments Returns _______ molecule new system object Examples -------- >>> import sasmol.system as system >>> import sasmol.util as utilties >>> molecule = system.Molecule('hiv1_gag.pdb') >>> molecule.name()[0] 'N' >>> molecule.name()[4] 'CA' >>> molecule.name()[14] 'HB1' >>> mask = [0, 4, 14] >>> new_molecule = utilities.Copy_Using_Mask.from_sasmol(molecule, mask) >>> new_molecule.name() ['N', 'CA', 'HB1'] >>> new_molecule.mass() array([ 14.00672 , 12.01078 , 1.007947]) Note ____ if more attributes are added in system.Atom() then the key lists below need to be updated currently only list_keys and numpy_keys are returned int_keys would have to be recalculated based on mask short_keys would have to be re-initialized (init_children) may want to consider passing a list of specific attributes to extract if memory is an issue copies all frames (untested) why can't this method be in subset? ''' list_keys = ['_residue_flag', '_occupancy', '_charge', '_atom', '_chain', '_segname', '_beta', '_loc', '_element', '_name', '_rescode', '_moltype', '_resname'] numpy_keys = ['_original_index', '_original_resid', '_index', '_resid', '_mass', '_coor'] short_keys = ['_resnames', '_resids', '_elements', '_segnames', '_betas', '_names', '_moltypes', '_occupancies'] int_keys = ['_number_of_chains', '_number_of_betas', '_number_of_resids', '_number_of_names', '_number_of_moltypes', '_number_of_resnames', '_number_of_segnames', '_number_of_elements', '_id', '_number_of_occupancies'] other_keys = ['_header', '_conect', '_debug'] new_dict = {} number_of_frames = len(class_instance.coor()) mask_length = len(mask) natoms = class_instance._natoms all_data = [[] for x in range(natoms)] for i in range(natoms): if i in mask: count = 0 for key, value in class_instance.__dict__.items(): if key in list_keys: all_data[count].append(value[i]) count += 1 count = 0 for key, value in class_instance.__dict__.items(): if key in list_keys: new_dict[key] = all_data[count] count += 1 molecule = system.Molecule() molecule.__dict__ = new_dict molecule.setId(0) molecule.setNatoms(mask_length) molecule.setOriginal_index(numpy.take( class_instance.original_index(), mask)) molecule.setOriginal_resid(numpy.take( class_instance.original_resid(), mask)) molecule.setIndex(numpy.take(class_instance.index(), mask)) molecule.setResid(numpy.take(class_instance.resid(), mask)) molecule.setMass(numpy.take(class_instance.mass(), mask)) molecule.setCoor(numpy.take(class_instance.coor(), mask, axis=1)) return molecule
[docs]def duplicate_molecule(molecule, number_of_duplicates): return [copy.deepcopy(molecule) for x in range(number_of_duplicates)]
NAME, NUM, LPAREN, RPAREN, EOS = list(range(5)) _lexer = re.compile(r"[A-Z][a-z]*|\d+|[()]|<EOS>").match del re # symbol, name, atomic number, molecular weight _data = r"""'Ac', 'Actinium', 89, 227 'Ag', 'Silver', 47, 107.868 'Al', 'Aluminum', 13, 26.98154 'Am', 'Americium', 95, 243 'Ar', 'Argon', 18, 39.948 'As', 'Arsenic', 33, 74.9216 'At', 'Astatine', 85, 210 'Au', 'Gold', 79, 196.9665 'B', 'Boron', 5, 10.81 'Ba', 'Barium', 56, 137.33 'Be', 'Beryllium', 4, 9.01218 'Bi', 'Bismuth', 83, 208.9804 'Bk', 'Berkelium', 97, 247 'Br', 'Bromine', 35, 79.904 'C', 'Carbon', 6, 12.011 'Ca', 'Calcium', 20, 40.08 'Cd', 'Cadmium', 48, 112.41 'Ce', 'Cerium', 58, 140.12 'Cf', 'Californium', 98, 251 'Cl', 'Chlorine', 17, 35.453 'Cm', 'Curium', 96, 247 'Co', 'Cobalt', 27, 58.9332 'Cr', 'Chromium', 24, 51.996 'Cs', 'Cesium', 55, 132.9054 'Cu', 'Copper', 29, 63.546 'Dy', 'Dysprosium', 66, 162.50 'Er', 'Erbium', 68, 167.26 'Es', 'Einsteinium', 99, 254 'Eu', 'Europium', 63, 151.96 'F', 'Fluorine', 9, 18.998403 'Fe', 'Iron', 26, 55.847 'Fm', 'Fermium', 100, 257 'Fr', 'Francium', 87, 223 'Ga', 'Gallium', 31, 69.735 'Gd', 'Gadolinium', 64, 157.25 'Ge', 'Germanium', 32, 72.59 'H', 'Hydrogen', 1, 1.0079 'He', 'Helium', 2, 4.0026 'Hf', 'Hafnium', 72, 178.49 'Hg', 'Mercury', 80, 200.59 'Ho', 'Holmium', 67, 164.9304 'I', 'Iodine', 53, 126.9045 'In', 'Indium', 49, 114.82 'Ir', 'Iridium', 77, 192.22 'K', 'Potassium', 19, 39.0983 'Kr', 'Krypton', 36, 83.80 'La', 'Lanthanum', 57, 138.9055 'Li', 'Lithium', 3, 6.94 'Lr', 'Lawrencium', 103, 260 'Lu', 'Lutetium', 71, 174.96 'Md', 'Mendelevium', 101, 258 'Mg', 'Magnesium', 12, 24.305 'Mn', 'Manganese', 25, 54.9380 'Mo', 'Molybdenum', 42, 95.94 'N', 'Nitrogen', 7, 14.0067 'Na', 'Sodium', 11, 22.98977 'Nb', 'Niobium', 41, 92.9064 'Nd', 'Neodymium', 60, 144.24 'Ne', 'Neon', 10, 20.17 'Ni', 'Nickel', 28, 58.71 'No', 'Nobelium', 102, 259 'Np', 'Neptunium', 93, 237.0482 'O', 'Oxygen', 8, 15.9994 'Os', 'Osmium', 76, 190.2 'P', 'Phosphorous', 15, 30.97376 'Pa', 'Proactinium', 91, 231.0359 'Pb', 'Lead', 82, 207.2 'Pd', 'Palladium', 46, 106.4 'Pm', 'Promethium', 61, 145 'Po', 'Polonium', 84, 209 'Pr', 'Praseodymium', 59, 140.9077 'Pt', 'Platinum', 78, 195.09 'Pu', 'Plutonium', 94, 244 'Ra', 'Radium', 88, 226.0254 'Rb', 'Rubidium', 37, 85.467 'Re', 'Rhenium', 75, 186.207 'Rh', 'Rhodium', 45, 102.9055 'Rn', 'Radon', 86, 222 'Ru', 'Ruthenium', 44, 101.07 'S', 'Sulfur', 16, 32.06 'Sb', 'Antimony', 51, 121.75 'Sc', 'Scandium', 21, 44.9559 'Se', 'Selenium', 34, 78.96 'Si', 'Silicon', 14, 28.0855 'Sm', 'Samarium', 62, 150.4 'Sn', 'Tin', 50, 118.69 'Sr', 'Strontium', 38, 87.62 'Ta', 'Tantalum', 73, 180.947 'Tb', 'Terbium', 65, 158.9254 'Tc', 'Technetium', 43, 98.9062 'Te', 'Tellurium', 52, 127.60 'Th', 'Thorium', 90, 232.0381 'Ti', 'Titanium', 22, 47.90 'Tl', 'Thallium', 81, 204.37 'Tm', 'Thulium', 69, 168.9342 'U', 'Uranium', 92, 238.029 'Unh', 'Unnilhexium', 106, 263 'Unp', 'Unnilpentium', 105, 260 'Unq', 'Unnilquadium', 104, 260 'Uns', 'Unnilseptium', 107, 262 'V', 'Vanadium', 23, 50.9415 'W', 'Tungsten', 74, 183.85 'Xe', 'Xenon', 54, 131.30 'Y', 'Yttrium', 39, 88.9059 'Yb', 'Ytterbium', 70, 173.04 'Zn', 'Zinc', 30, 65.38 'Zr', 'Zirconium', 40, 91.22 'D', 'Deuterium', 1, 2.158"""
[docs]class Element: def __init__(self, symbol, name, atomicnumber, molweight): self.sym = symbol self.name = name self.ano = atomicnumber self.mw = molweight
[docs] def getweight(self): return self.mw
[docs] def addsyms(self, weight, result): result[self.sym] = result.get(self.sym, 0) + weight
[docs]def build_dict(s): import string answer = {} for line in s.split("\n"): symbol, name, num, weight = eval(line) answer[symbol] = Element(symbol, name, num, weight) return answer
[docs]class ElementSequence: def __init__(self, *seq): self.seq = list(seq) self.count = 1
[docs] def append(self, thing): self.seq.append(thing)
[docs] def getweight(self): sum = 0.0 for thing in self.seq: sum = sum + thing.getweight() return sum * self.count
[docs] def set_count(self, n): self.count = n
def __len__(self): return len(self.seq)
[docs] def addsyms(self, weight, result): totalweight = weight * self.count for thing in self.seq: thing.addsyms(totalweight, result)
[docs] def displaysyms(self, sym2elt): result = {} self.addsyms(1, result) items = list(result.items()) items.sort() for sym, count in items: print(sym, " : ", count)
[docs]class Tokenizer: def __init__(self, input): self.input = input + "<EOS>" self.i = 0
[docs] def gettoken(self): global ttype, tvalue self.lasti = self.i m = _lexer(self.input, self.i) if m is None: self.error("unexpected character") self.i = m.end() tvalue = m.group() if tvalue == "(": ttype = LPAREN elif tvalue == ")": ttype = RPAREN elif tvalue == "<EOS>": ttype = EOS elif "0" <= tvalue[0] <= "9": ttype = NUM tvalue = int(tvalue) else: ttype = NAME
[docs] def error(self, msg): emsg = msg + ":\n" emsg = emsg + self.input[:-5] + "\n" # strip <EOS> emsg = emsg + " " * self.lasti + "^\n" raise ValueError(emsg)
[docs]def parse(s, sym2elt): global t, ttype, tvalue t = Tokenizer(s) t.gettoken() seq = parse_sequence(sym2elt) if ttype != EOS: t.error("expected end of input") return seq
[docs]def parse_sequence(sym2elt): global t, ttype, tvalue seq = ElementSequence() while ttype in (LPAREN, NAME): # parenthesized expression or straight name if ttype == LPAREN: t.gettoken() thisguy = parse_sequence(sym2elt) if ttype != RPAREN: t.error("expected right paren") t.gettoken() else: assert ttype == NAME if tvalue in sym2elt: thisguy = ElementSequence(sym2elt[tvalue]) else: t.error("'" + tvalue + "' is not an element symbol") t.gettoken() # followed by optional count if ttype == NUM: thisguy.set_count(tvalue) t.gettoken() seq.append(thisguy) if len(seq) == 0: t.error("empty sequence") return seq
[docs]def get_chemical_formula(formula_string): # standard_atomic_weights = Atomic.amu(keep_lower_case=True) sym2elt = build_dict(_data) # #sym2elt = amu formula_dictionary = {} error = [] try: seq = parse(formula_string.strip(" "), sym2elt) # seq.displaysyms(sym2elt) seq.addsyms(1, formula_dictionary) items = list(formula_dictionary.items()) items.sort() # for sym, count in items: # print sym," :: ",count except ValueError as detail: error.append(str(detail)) return error, formula_dictionary
if __name__ == "__main__": path = './' filename = 'run_0' ''' while 1: x = input("? ") fields = string.split(x) if len(fields) != 2: print("input must have two fields") continue action, formula = fields ok = 0 try: seq = parse(formula,sym2elt) ok = 1 except ValueError as detail: print(str(detail)) if not ok: continue if action == "molw": print("molecular weight", seq.getweight()) elif action == "syms": seq.displaysyms(sym2elt) else: print("unknown action:", action) '''
[docs]def parse_fasta(fasta_sequence, **kwargs): """ method to convert fasta_sequence object to list of strings for each valid sequence in the initial object format is based on the NCBI fasta format convention https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp notes: Parameters ---------- fasta_sequence list with formatted fasta input kwargs optional future arguments Returns ------- all_sequences a list containing sequences without comments, spaces, carriage returns, numbers, or termination flags. or an error indicating an empty line in the file Examples ------- >>> import sasmol.sasutil as sasutil >>> fasta_sequence = open('test_fasta.txt', 'r').readlines() >>> all_sequences = sasutil.parse_fasta(fasta_sequence) >>> print(all_sequences) Note ---- 1) lines beginning with > or ; are treated as comments and passed 2) spaces are ignored 3) * are ignored (should be a termination) 4) numbers are ignored so you can have numbering at the beginning of a line 5) \n are processed 6) comment lines are NOT required in the input 7) comment lines cause a new sequence to be started """ all_sequences = [] for line in fasta_sequence: # check if line is empty if line.strip() == '': error = 'ERROR: empty lines in fasta sequence are not allowed' print(error) return error # check if first character is the comment identifier elif line[0] == '>' or line[0] == ';': # try to add completed sequence to full list (if it exists) try: if new_sequence != '': all_sequences.append(new_sequence) except: pass new_sequence = '' else: for char in line: if not char.isspace() and char != '*' and not char.isdigit(): try: new_sequence += char except: new_sequence = char if new_sequence != '': all_sequences.append(new_sequence) return all_sequences
[docs]def check_integrity(obj, fast_check=False, warn=True): import logging import sasmol.config as config logger = logging.getLogger(__name__) natoms = obj.natoms() results = {} errors = [] core_keys = [ '_atom', '_index', '_name', '_loc', '_resname', '_chain', '_resid', '_rescode', '_occupancy', '_beta', '_segname', '_element', '_charge' ] for key in core_keys: val = obj.__dict__.get(key, None) if val is None: results[key] = None continue try: length = len(val) results[key] = length if length != natoms: errors.append((key, length, natoms)) except: results[key] = 'N/A' if errors and warn: logger.warning('Integrity check mismatch: %s', errors) elif config.__logging_level__ == 'DEBUG': logger.debug('Integrity check passed: natoms=%s', natoms) return results