#!/usr/bin/python
# You may need to change this to /usr/local/bin/python

program_name = 'xpm2matrix'

# Name: 
#    xpm2matrix
# Version:
#    v0.2,  2007-8-03
# Author:
#    Andrew Jewett, Shea Group, University of California at Santa Barbara
# License:
#    GNU LGPL and GPL (General Public Licence)
# Comments:
#    Be forgiving.  It's my first python program
#   (I think one of the reasons it is so slow is because the program has to 
#    read in the entire .xpm file and transpose the data before it can begin.
#    This seems to be where the program is slowing down.  However it could also
#    be because I am using memory inefficienly (making temporary copies).
#    Regardless, for really long trajectories, or large systems,
#    the arrays inside the xpm files can be quite large.
#    I hope the authors of g_hbond will later provide an option to save
#    the xpm data with the x and y axis exchanged.
#    That way, I could just read in and process one line at a time.)
#
# Description below:

docs = 'syntax (example):\n' \
+ '\n' \
+ '   '+program_name+' hbond.ndx groups1.ndx groups2.ndx [DA,AD] < hbmap.xpm > con.mat\n' \
+ '\n' \
+ 'Description:\n' \
+ '\n' \
+ '    This program reads .xpm files generated by \"g_hbond -hbm ...\",\n' \
+ ' and, for each frame in the trajectory, prints out a 2-dimensional table\n' \
+ ' indicating which pairs of atoms (or residues, or groups of atoms)\n' \
+ ' are connected together (a connectivity matrix). This table may be easier\n' \
+ ' to interpret than the original .xpm file.  As an input it requires a pair\n' \
+ ' of files (\"groups1.ndx\" and \"groups2.ndx\" in this example) and each file\n' \
+ ' contains a list of groups.  (Each group typically represents the atoms\n' \
+ ' belonging to each amino acid or nucleotide in a molecule.)  Since there\n' \
+ ' are two lists of groups, you can look at the connectivity matrix between\n'\
+ ' two different molecules.  The the two lists of groups do not have to\n' \
+ ' correspond to residues from different molecules.  They can be identical.\n' \
+ '     The atoms in each group are not required to be donors or acceptors\n'\
+ ' (other atoms will be ignored).  It is possible to tell '+program_name+'\n'\
+ ' to only consider the donor atoms from one group, and the acceptor atoms\n'\
+ ' from another group. (See \"DA\" / \"AD\" below).\n' \
+ '     Again, groups do not have to correspond to residues.  Of course, there\n' \
+ ' can be multiple donors and acceptors in each group, and more than one h-bond\n' \
+ ' connecting each pair of groups.  For example, one of list of groups\n' \
+ ' (say \"groups2.ndx\") might contain only a single group consisting of all\n' \
+ ' of the solvent atoms.  In that case, for every frame in the trajectory\n' \
+ ' '+program_name+' would simply report the number of water molecules bonded to \n' \
+ ' every residue/group in the molecule.\n' \
+ '\n' \
+ 'Output (stored in the \"con.mat\" file in the example above):\n' \
+ '\n' \
+ '    For each frame in the trajectory used to generate the xpm file, this\n' \
+ ' program generates a connectivity matrix (delimited by spaces and newlines):\n' \
+ '\n' \
+ 'c11 c12 c13 ... c1n\n' \
+ 'c21 c22 c23 ... c2n\n' \
+ 'c31 c32 c33 ... c3n\n' \
+ ' :   :   :       :\n' \
+ 'cm1 cm2 cm3 ... cmn\n' \
+ '\n' \
+ ' where cij = the number of hydrogen bonds connecting group i (from the\n' \
+ '             first list of groups) with group j (from the second list)\n' \
+ '\n' \
+ 'Input:\n' \
+ '\n' \
+ ' hbmap.xpm    An .xpm file created by \"g_hbond -hbm\" that you want to analyze\n' \
+ '              (See documentation for g_hbond.)\n' \
+ '\n' \
+ 'Arguments:\n' \
+ '\n' \
+ ' hbond.ndx    The ndx file created by \"g_hbond -hbn ...\"\n' \
+ ' groups1.ndx     The number of hydrogen bonds is counted between every pair\n' \
+ ' groups2.ndx     of groups in these two index files: groups1.ndx & groups2.ndx.\n' \
+ ' DA or AD     The 4th argument DA / AD is optional.\n' \
+ '\n' \
+ '              By using the \"DA\" and \"AD\" arguments, the user can limit\n' \
+ '              the atoms that the program considers from groups1.ndx\n'\
+ '              and groups2.ndx.  The DA command line argument causes\n' \
+ '              '+program_name+' to ignore all atoms from the first list of\n' \
+ '              groups which are not potential donor atoms, and all atoms\n' \
+ '              from the second list which are not potential acceptors.\n' \
+ '              (The reverse is true for the AD command.)\n' \
+ '\n'\
+ 'Performance Issues:\n' \
+ '\n'\
+ ' This program can consume lots of memory, due to the order that data is\n' \
+ ' saved to the xpm file, especially for long trajectories.  If the program\n'\
+ ' is performing very slowly (running out of memory), you can get around the\n'\
+ ' problem by running g_hbond multiple times using the \"-b\" and \"-e\" flags,\n'+ ' in order to generate multiple smaller .xpm files.  You can run ' \
+program_name+'\n'\
+ ' multiple times and concatenate the results together.\n\n'


#  Read_hbond2atompairs(file)
# Read in the "hbond.ndx" file generated by "g_hbond -hbn hbond.ndx":
# Convert this information into a dictionary of tuples.
# (A simple list of tuples would also work, and be simpler.  Oh well.)
# Later, when we read in the body of the .xpm file, this lookup table
# will tell how the position in the .xpm file tells us
# which pair of (heavy) atoms are connected by the hydrogen bond?


def Read_hbond2atompairs(file):
	missing = -1
	lines = file.readlines()
	hb2ap = [] # <-which (DA) pairs of atoms correspond to each h-bond?
	redundant = [] # <-if we have read that pair already indicate that here
	               # (To avoid counting the same donor/acceptor pair twice)
	pairs = {} # <-used to keep track of which pairs we have read in so far

	for i in range(len(lines)-1, 0, -1):
		if (lines[i].find('[') == -1):
			tokens = lines[i].split()
			if len(tokens) == 3:
				[D, H, A] = lines[i].split()
				hb2ap.append((int(D), int(A)))
				if pairs.get((int(D),int(A)),missing)==missing:
					redundant.append(False)
					pairs[(int(D),int(A))] = True
				else:
					redundant.append(True)

			elif len(tokens) > 0:
				print '\n ---- Format error: ----\n' \
				'Where: the .ndx file that should have been created by \"g_hbond -hbn\"\n' \
				'Explanation: Most likely, you have specified your files in the wrong order.\n\n' \
				' ---- error details: ---- \n' \
				'You have the wrong number of entries on line ' \
				+str(i+1)+'\n'+'of the .ndx file.\n'
				'This file is supposed to contain h-bond triplets (DHA) in the last group.\n' \
				'Consequently, the last group should contain 3 atoms per line.'
				sys.exit(-2)
		else:
			break

	# We started this loop at the end of the list and read it in in reverse
	# order. I would think we need to reverse it again:  Consequently:
	#hb2ap.reverse()     # <-- oops.  NO!
	#redundant.reverse() # <-- no
	#COMMENTING PREVIOUS LINES: Don't reverse this list! g_hbond apparently
	#   creates the hbond.ndx list in the OPPOSITE order as the hbmap.xpm
	#   ..So just leave hb2ap the way it is (reversed).

	return hb2ap,redundant  # <-- Return a list of pairs of atoms, hb2ap
	                        #  The caller is warned if a pair is redundant


# Scan through the .xpm file to find the dimensions of the map to find:
#   The number of save points in the trajectory (= number of columns), and
#   the number of possible h-bonds (= number of rows).
#   Note: Table must be pre-allocated.

def ReadTableDimensions(file):
	for line in file:
		if line.find('*') == -1:
			dimensions = line.replace('\"', "").split()
			return int(dimensions[0]), int(dimensions[1])

# TranposeTable() reads in all the lines of a file. After removing the quotes
# the remaining data is assumed to be a 2-dimensional array of characters.
# We copy this two dimensional array of characters to a table (a 2-D list)
# however we transpose it as we copy it.
# (Why do we transpose the data?
#  Because g_hbond saves creates a .xpm file with the data saved
#  in a way which the array in a strange format where the data from 
#  different times, appears in different COLUMNS (not rows).
#  This is the opposite order that we need to access the data, 
#  so we transpose it now.)

def TransposeTable(file, table):
	j = 0
	for line in file:
		if (line.find('*') == -1) and (line.find('"') != -1):
			line_no_quotes = line.replace('\"','') # get rid of "
			for i in range(0,len(line_no_quotes)):
				c = line_no_quotes[i]
				if ((c != '\n') and (c != ',')):
					#print 'i,j,\'c\' = '+str(i)+','+str(j)+'\''+c+'\''
					table[i][j] = c #copy & transpose
			j += 1


# Read the list of atoms belonging to every group
# (By "group", I mean the groups that the user wants us to keep track of when
#  we count the number of h-bonds between groups. These two lists of groups are
#  indicated by the 2nd and 3rd command line arguments to the main program.

def Read_atoms2groups(file):
	a2g = {}
	g_count = bracket_count = 0
	for line in file:
		if (line.find('[') == -1):
			for a in line.split():
				a2g[int(a)] = g_count
		else:
			bracket_count += 1
			if ((len(a2g) != 0) or (bracket_count > 1)):
				g_count+=1
	return a2g, g_count+1 # return the lookup table & number of groups


def PrintBondTable(numbonds):
	for g1 in range(0,len(numbonds)):
		for g2 in range(0,len(numbonds[g1])):
			sys.stdout.write(str(numbonds[g1][g2]))
			if g2+1 < len(numbonds[g1]):
				sys.stdout.write(' ')
			else:
				sys.stdout.write('\n')
	sys.stdout.write('\n')


#  main program:
import sys
if (len(sys.argv) < 4) or (len(sys.argv) > 5):
	print '\nError:\n       Expected at least 3 arguments.\n\n' + docs
	sys.exit(-1)

hb2ap_filename = sys.argv[1]
a2g_filename1 = sys.argv[2]
a2g_filename2 = sys.argv[3]

g1donors_g2acceptors = False
g1acceptors_g2donors = False
if len(sys.argv) > 4:
	if sys.argv[4] == 'DA':
		g1donors_g2acceptors = True
	elif sys.argv[4] == 'AD':
		g1acceptors_g2donors = True

hb2ap,redundant = Read_hbond2atompairs(open(hb2ap_filename,'r'))
a2g1,numgroups1 = Read_atoms2groups(open(a2g_filename1,'r'))
a2g2,numgroups2 = Read_atoms2groups(open(a2g_filename2,'r'))

#print hb2ap
#print redundant
#print a2g1
#print a2g2

# Read in the dimensions of the data in the .xpm file
# This includes the number of bonds, and the duration of the simulation.
# This should agree with the number of rows and columns in the .xpm file.

(sim_duration, numbonds_hb2ap) = ReadTableDimensions(sys.stdin)


# Now that we know how big it is, we create a large array 
# to store the data in the .xpm file

lines = [[' ' for j in range(0,numbonds_hb2ap)] for i in range(0,sim_duration)]

TransposeTable(sys.stdin, lines)

if numbonds_hb2ap != len(hb2ap):
	print '\nError: the number of hbonds [='+str(numbonds_hb2ap)+'] (Donor-Hydrogen-Acceptor triplets)\n'+\
	'       from the file \"'+hb2ap_filename+'\", (created by running \"g_hbond -hbn\")\n'+\
	'       does not match the number of rows of data contained in the\n'+\
	'       .xpm file [='+str(len(hb2ap))+'] (created by running \"g_hbond -hbm\").  Assuming you\n'+\
	'       did not make a mistake entering the filenames, you are probably using\n'+\
	'       buggy version of g_hbond.  The version of g_hbond which ships with\n'+\
	'      gromacs 3.3.1 has this bug.  (But it looks like they fixed it in the\n'+\
	'       CVS version.)  You need to obtain a newer version of g_hbond,\n'+\
	'       and/or download and compile the CVS version.\n'+\
	'       Then you must use it to regenerate your .xpm and .ndx files.\n'
	sys.exit(-3)

# Our goal:  For each moment in time during the simulation,
#            we want to calculate the following:

# numbondsDA[g1][g2] 
# counts the number of times a donor from group g1 bonds to an acceptor from g2

# numbondsAD[g1][g2] 
# counts the number of times an acceptor from group g1 bonds to a donor from g2

# numbonds[g1][g2] 
# counts the total number of hydrogen bonds between groups g1 and g2
#   that is:
#   numbonds[g1][g2] = numbondsDA[g1][g2] + numbondsAD[g1][g2]

missing = -1

for line in lines:
	# preallocate the bond-counter tables to size: numgroups1 x numbroups2:
	numbonds = [[0 for g2 in range(0,numgroups2)] \
	for g1 in range(0,numgroups1)]
	numbondsDA=[[0 for g2 in range(0,numgroups2)] \
	for g1 in range(0,numgroups1)]
	numbondsAD=[[0 for g2 in range(0,numgroups2)] \
	for g1 in range(0,numgroups1)]

	# Loop over all of the possible bonds at this moment in time
	for b in range(0,len(hb2ap)):

		# If the bond exists,update the relevant bond counters:
		if ((line[b] == 'o') and (not redundant[b])):
			D,A = hb2ap[b] # lookup the donor and acceptor atoms
			g1=a2g1.get(D,missing)#ifD belongs to a group in g1.ndx
			g2=a2g2.get(A,missing)#& A belongs to a group in g2.ndx
			#print str(D)+','+str(A)+'{g'+str(g1+1)+',g'+str(g2+1)
			if ((g1!=missing) and (g2!=missing)):# then add 1 to
				numbondsDA[g1][g2] += 1      # the relevant
				numbonds[g1][g2] += 1        # bond counters

			g1=a2g1.get(A,missing)#ifA belongs to a group in g1.ndx
			g2=a2g2.get(D,missing)#& D belongs to a group in g2.ndx
			#print '  and {g'+str(g1+1)+',g'+str(g2+1)+'}'
			if ((g1!=missing) and (g2!=missing)):# then add 1 to
				numbondsAD[g1][g2] +=1       # the relevant
				numbonds[g1][g2] +=1         # bond counters

	if g1donors_g2acceptors:
		#print ' -----DA:---- '
		PrintBondTable(numbondsDA)
	elif g1acceptors_g2donors:
		#print '     -AD:- '
		PrintBondTable(numbondsAD)
	else:
		#print '   -total:- '
		PrintBondTable(numbonds)