# from __future__ import absolute_import
# from __future__ import unicode_literals
#
# SASMOL: Copyright (C) 2011 Joseph E. Curtis, Ph.D.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# UTIL
#
# 12/10/2009 -- initial coding : jc
# 11/24/2011 -- moved to seperate file : jc
# 12/27/2015 -- refactored for release : jc
# 08/19/2016 -- added doc strings : jc
#
# 1 2 3 4 5 6 7
# LC4567890123456789012345678901234567890123456789012345678901234567890123456789
# * **
'''
Util holds general methods for file naming, os differences,
chemical formula parsing, etc.
'''
import sasmol.system as system
import re
import string
import copy
import numpy
[docs]def find_unique(this_list):
return list(numpy.unique(this_list))
[docs]class Copy_Using_Mask():
#@classmethod
#def from_sasmol(class_instance, mask, **kwargs):
[docs] @classmethod
def from_sasmol(cls, class_instance, mask, **kwargs):
'''
Parameters
__________
class_instance
system object
mask
integer list
kwargs
optional future keyword arguments
Returns
_______
molecule
new system object
Examples
--------
>>> import sasmol.system as system
>>> import sasmol.util as utilties
>>> molecule = system.Molecule('hiv1_gag.pdb')
>>> molecule.name()[0]
'N'
>>> molecule.name()[4]
'CA'
>>> molecule.name()[14]
'HB1'
>>> mask = [0, 4, 14]
>>> new_molecule = utilities.Copy_Using_Mask.from_sasmol(molecule, mask)
>>> new_molecule.name()
['N', 'CA', 'HB1']
>>> new_molecule.mass()
array([ 14.00672 , 12.01078 , 1.007947])
Note
____
if more attributes are added in system.Atom() then the key lists below need
to be updated
currently only list_keys and numpy_keys are returned
int_keys would have to be recalculated based on mask
short_keys would have to be re-initialized (init_children)
may want to consider passing a list of specific attributes to extract if memory is an issue
copies all frames (untested)
why can't this method be in subset?
'''
list_keys = ['_residue_flag', '_occupancy', '_charge', '_atom', '_chain', '_segname',
'_beta', '_loc', '_element', '_name', '_rescode', '_moltype', '_resname']
numpy_keys = ['_original_index', '_original_resid',
'_index', '_resid', '_mass', '_coor']
short_keys = ['_resnames', '_resids', '_elements',
'_segnames', '_betas', '_names', '_moltypes', '_occupancies']
int_keys = ['_number_of_chains', '_number_of_betas', '_number_of_resids', '_number_of_names', '_number_of_moltypes',
'_number_of_resnames', '_number_of_segnames', '_number_of_elements', '_id', '_number_of_occupancies']
other_keys = ['_header', '_conect', '_debug']
new_dict = {}
number_of_frames = len(class_instance.coor())
mask_length = len(mask)
natoms = class_instance._natoms
all_data = [[] for x in range(natoms)]
for i in range(natoms):
if i in mask:
count = 0
for key, value in class_instance.__dict__.items():
if key in list_keys:
all_data[count].append(value[i])
count += 1
count = 0
for key, value in class_instance.__dict__.items():
if key in list_keys:
new_dict[key] = all_data[count]
count += 1
molecule = system.Molecule()
molecule.__dict__ = new_dict
molecule.setId(0)
molecule.setNatoms(mask_length)
molecule.setOriginal_index(numpy.take(
class_instance.original_index(), mask))
molecule.setOriginal_resid(numpy.take(
class_instance.original_resid(), mask))
molecule.setIndex(numpy.take(class_instance.index(), mask))
molecule.setResid(numpy.take(class_instance.resid(), mask))
molecule.setMass(numpy.take(class_instance.mass(), mask))
molecule.setCoor(numpy.take(class_instance.coor(), mask, axis=1))
return molecule
[docs]def duplicate_molecule(molecule, number_of_duplicates):
return [copy.deepcopy(molecule) for x in range(number_of_duplicates)]
NAME, NUM, LPAREN, RPAREN, EOS = list(range(5))
_lexer = re.compile(r"[A-Z][a-z]*|\d+|[()]|<EOS>").match
del re
# symbol, name, atomic number, molecular weight
_data = r"""'Ac', 'Actinium', 89, 227
'Ag', 'Silver', 47, 107.868
'Al', 'Aluminum', 13, 26.98154
'Am', 'Americium', 95, 243
'Ar', 'Argon', 18, 39.948
'As', 'Arsenic', 33, 74.9216
'At', 'Astatine', 85, 210
'Au', 'Gold', 79, 196.9665
'B', 'Boron', 5, 10.81
'Ba', 'Barium', 56, 137.33
'Be', 'Beryllium', 4, 9.01218
'Bi', 'Bismuth', 83, 208.9804
'Bk', 'Berkelium', 97, 247
'Br', 'Bromine', 35, 79.904
'C', 'Carbon', 6, 12.011
'Ca', 'Calcium', 20, 40.08
'Cd', 'Cadmium', 48, 112.41
'Ce', 'Cerium', 58, 140.12
'Cf', 'Californium', 98, 251
'Cl', 'Chlorine', 17, 35.453
'Cm', 'Curium', 96, 247
'Co', 'Cobalt', 27, 58.9332
'Cr', 'Chromium', 24, 51.996
'Cs', 'Cesium', 55, 132.9054
'Cu', 'Copper', 29, 63.546
'Dy', 'Dysprosium', 66, 162.50
'Er', 'Erbium', 68, 167.26
'Es', 'Einsteinium', 99, 254
'Eu', 'Europium', 63, 151.96
'F', 'Fluorine', 9, 18.998403
'Fe', 'Iron', 26, 55.847
'Fm', 'Fermium', 100, 257
'Fr', 'Francium', 87, 223
'Ga', 'Gallium', 31, 69.735
'Gd', 'Gadolinium', 64, 157.25
'Ge', 'Germanium', 32, 72.59
'H', 'Hydrogen', 1, 1.0079
'He', 'Helium', 2, 4.0026
'Hf', 'Hafnium', 72, 178.49
'Hg', 'Mercury', 80, 200.59
'Ho', 'Holmium', 67, 164.9304
'I', 'Iodine', 53, 126.9045
'In', 'Indium', 49, 114.82
'Ir', 'Iridium', 77, 192.22
'K', 'Potassium', 19, 39.0983
'Kr', 'Krypton', 36, 83.80
'La', 'Lanthanum', 57, 138.9055
'Li', 'Lithium', 3, 6.94
'Lr', 'Lawrencium', 103, 260
'Lu', 'Lutetium', 71, 174.96
'Md', 'Mendelevium', 101, 258
'Mg', 'Magnesium', 12, 24.305
'Mn', 'Manganese', 25, 54.9380
'Mo', 'Molybdenum', 42, 95.94
'N', 'Nitrogen', 7, 14.0067
'Na', 'Sodium', 11, 22.98977
'Nb', 'Niobium', 41, 92.9064
'Nd', 'Neodymium', 60, 144.24
'Ne', 'Neon', 10, 20.17
'Ni', 'Nickel', 28, 58.71
'No', 'Nobelium', 102, 259
'Np', 'Neptunium', 93, 237.0482
'O', 'Oxygen', 8, 15.9994
'Os', 'Osmium', 76, 190.2
'P', 'Phosphorous', 15, 30.97376
'Pa', 'Proactinium', 91, 231.0359
'Pb', 'Lead', 82, 207.2
'Pd', 'Palladium', 46, 106.4
'Pm', 'Promethium', 61, 145
'Po', 'Polonium', 84, 209
'Pr', 'Praseodymium', 59, 140.9077
'Pt', 'Platinum', 78, 195.09
'Pu', 'Plutonium', 94, 244
'Ra', 'Radium', 88, 226.0254
'Rb', 'Rubidium', 37, 85.467
'Re', 'Rhenium', 75, 186.207
'Rh', 'Rhodium', 45, 102.9055
'Rn', 'Radon', 86, 222
'Ru', 'Ruthenium', 44, 101.07
'S', 'Sulfur', 16, 32.06
'Sb', 'Antimony', 51, 121.75
'Sc', 'Scandium', 21, 44.9559
'Se', 'Selenium', 34, 78.96
'Si', 'Silicon', 14, 28.0855
'Sm', 'Samarium', 62, 150.4
'Sn', 'Tin', 50, 118.69
'Sr', 'Strontium', 38, 87.62
'Ta', 'Tantalum', 73, 180.947
'Tb', 'Terbium', 65, 158.9254
'Tc', 'Technetium', 43, 98.9062
'Te', 'Tellurium', 52, 127.60
'Th', 'Thorium', 90, 232.0381
'Ti', 'Titanium', 22, 47.90
'Tl', 'Thallium', 81, 204.37
'Tm', 'Thulium', 69, 168.9342
'U', 'Uranium', 92, 238.029
'Unh', 'Unnilhexium', 106, 263
'Unp', 'Unnilpentium', 105, 260
'Unq', 'Unnilquadium', 104, 260
'Uns', 'Unnilseptium', 107, 262
'V', 'Vanadium', 23, 50.9415
'W', 'Tungsten', 74, 183.85
'Xe', 'Xenon', 54, 131.30
'Y', 'Yttrium', 39, 88.9059
'Yb', 'Ytterbium', 70, 173.04
'Zn', 'Zinc', 30, 65.38
'Zr', 'Zirconium', 40, 91.22
'D', 'Deuterium', 1, 2.158"""
[docs]class Element:
def __init__(self, symbol, name, atomicnumber, molweight):
self.sym = symbol
self.name = name
self.ano = atomicnumber
self.mw = molweight
[docs] def getweight(self):
return self.mw
[docs] def addsyms(self, weight, result):
result[self.sym] = result.get(self.sym, 0) + weight
[docs]def build_dict(s):
import string
answer = {}
for line in s.split("\n"):
symbol, name, num, weight = eval(line)
answer[symbol] = Element(symbol, name, num, weight)
return answer
[docs]class ElementSequence:
def __init__(self, *seq):
self.seq = list(seq)
self.count = 1
[docs] def append(self, thing):
self.seq.append(thing)
[docs] def getweight(self):
sum = 0.0
for thing in self.seq:
sum = sum + thing.getweight()
return sum * self.count
[docs] def set_count(self, n):
self.count = n
def __len__(self):
return len(self.seq)
[docs] def addsyms(self, weight, result):
totalweight = weight * self.count
for thing in self.seq:
thing.addsyms(totalweight, result)
[docs] def displaysyms(self, sym2elt):
result = {}
self.addsyms(1, result)
items = list(result.items())
items.sort()
for sym, count in items:
print(sym, " : ", count)
[docs]class Tokenizer:
def __init__(self, input):
self.input = input + "<EOS>"
self.i = 0
[docs] def gettoken(self):
global ttype, tvalue
self.lasti = self.i
m = _lexer(self.input, self.i)
if m is None:
self.error("unexpected character")
self.i = m.end()
tvalue = m.group()
if tvalue == "(":
ttype = LPAREN
elif tvalue == ")":
ttype = RPAREN
elif tvalue == "<EOS>":
ttype = EOS
elif "0" <= tvalue[0] <= "9":
ttype = NUM
tvalue = int(tvalue)
else:
ttype = NAME
[docs] def error(self, msg):
emsg = msg + ":\n"
emsg = emsg + self.input[:-5] + "\n" # strip <EOS>
emsg = emsg + " " * self.lasti + "^\n"
raise ValueError(emsg)
[docs]def parse(s, sym2elt):
global t, ttype, tvalue
t = Tokenizer(s)
t.gettoken()
seq = parse_sequence(sym2elt)
if ttype != EOS:
t.error("expected end of input")
return seq
[docs]def parse_sequence(sym2elt):
global t, ttype, tvalue
seq = ElementSequence()
while ttype in (LPAREN, NAME):
# parenthesized expression or straight name
if ttype == LPAREN:
t.gettoken()
thisguy = parse_sequence(sym2elt)
if ttype != RPAREN:
t.error("expected right paren")
t.gettoken()
else:
assert ttype == NAME
if tvalue in sym2elt:
thisguy = ElementSequence(sym2elt[tvalue])
else:
t.error("'" + tvalue + "' is not an element symbol")
t.gettoken()
# followed by optional count
if ttype == NUM:
thisguy.set_count(tvalue)
t.gettoken()
seq.append(thisguy)
if len(seq) == 0:
t.error("empty sequence")
return seq
if __name__ == "__main__":
path = './'
filename = 'run_0'
'''
while 1:
x = input("? ")
fields = string.split(x)
if len(fields) != 2:
print("input must have two fields")
continue
action, formula = fields
ok = 0
try:
seq = parse(formula,sym2elt)
ok = 1
except ValueError as detail:
print(str(detail))
if not ok:
continue
if action == "molw":
print("molecular weight", seq.getweight())
elif action == "syms":
seq.displaysyms(sym2elt)
else:
print("unknown action:", action)
'''
[docs]def parse_fasta(fasta_sequence, **kwargs):
"""
method to convert fasta_sequence object to list of strings
for each valid sequence in the initial object
format is based on the NCBI fasta format convention
https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp
notes:
Parameters
----------
fasta_sequence
list with formatted fasta input
kwargs
optional future arguments
Returns
-------
all_sequences
a list containing sequences without comments, spaces, carriage returns, numbers,
or termination flags.
or
an error indicating an empty line in the file
Examples
-------
>>> import sasmol.sasutil as sasutil
>>> fasta_sequence = open('test_fasta.txt', 'r').readlines()
>>> all_sequences = sasutil.parse_fasta(fasta_sequence)
>>> print(all_sequences)
Note
----
1) lines beginning with > or ; are treated as comments and passed
2) spaces are ignored
3) * are ignored (should be a termination)
4) numbers are ignored so you can have numbering at the beginning of a line
5) \n are processed
6) comment lines are NOT required in the input
7) comment lines cause a new sequence to be started
"""
all_sequences = []
for line in fasta_sequence:
# check if line is empty
if line.strip() == '':
error = 'ERROR: empty lines in fasta sequence are not allowed'
print(error)
return error
# check if first character is the comment identifier
elif line[0] == '>' or line[0] == ';':
# try to add completed sequence to full list (if it exists)
try:
if new_sequence != '':
all_sequences.append(new_sequence)
except:
pass
new_sequence = ''
else:
for char in line:
if not char.isspace() and char != '*' and not char.isdigit():
try:
new_sequence += char
except:
new_sequence = char
if new_sequence != '':
all_sequences.append(new_sequence)
return all_sequences
[docs]def check_integrity(obj, fast_check=False, warn=True):
import logging
import sasmol.config as config
logger = logging.getLogger(__name__)
natoms = obj.natoms()
results = {}
errors = []
core_keys = [
'_atom', '_index', '_name', '_loc', '_resname', '_chain',
'_resid', '_rescode', '_occupancy', '_beta',
'_segname', '_element', '_charge'
]
for key in core_keys:
val = obj.__dict__.get(key, None)
if val is None:
results[key] = None
continue
try:
length = len(val)
results[key] = length
if length != natoms:
errors.append((key, length, natoms))
except:
results[key] = 'N/A'
if errors and warn:
logger.warning('Integrity check mismatch: %s', errors)
elif config.__logging_level__ == 'DEBUG':
logger.debug('Integrity check passed: natoms=%s', natoms)
return results