001 /*
002 * NeedlemanWunsch.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.IOException;
036
037 /**
038 * This class implements the classic global alignment algorithm (with linear gap penalty
039 * function) due to S.B.Needleman and C.D.Wunsch (1970).
040 *
041 * <P>It is based on a dynamic programming approach. The idea consists of, given two
042 * sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that
043 * contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix
044 * holds the score between the subsequences A[1..i] and B[1..j]. The first row and column
045 * represent alignments with spaces.</P>
046 *
047 * <P>Starting from row 0, column 0, the algorithm computes each position M[i,j] with the
048 * following recurrence:</P>
049 *
050 * <CODE><BLOCKQUOTE><PRE>
051 * M[0,0] = 0
052 * M[i,j] = max { M[i,j-1] + scoreInsertion (B[j]),
053 * M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
054 * M[i-1,j] + scoreDeletion(A[i]) }
055 * </PRE></BLOCKQUOTE></CODE>
056 *
057 * <P>In the end, the value at the last position (last row, last column) will contain
058 * the similarity between the two sequences. This part of the algorithm is accomplished
059 * by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity
060 * since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing
061 * each cell is constant, it also has quadratic time complexity.</P>
062 *
063 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
064 * back in the matrix from the last position to the first. This step is performed by
065 * the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can
066 * be roughly as long as (m + n), this method has O(n) time complexity.</P>
067 *
068 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
069 * reduce the space requirement to O(n) by keeping just the last row or column in memory.
070 * This is precisely what is done by the {@link #computeScore computeScore} method. Note
071 * that it still requires O(n<SUP>2</SUP>) time.</P>
072 *
073 * <P>For a more efficient approach to the global alignment problem, see the
074 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the
075 * {@linkplain SmithWaterman} algorithm.</P>
076 *
077 * @author Sergio A. de Carvalho Jr.
078 * @see SmithWaterman
079 * @see CrochemoreLandauZivUkelson
080 * @see CrochemoreLandauZivUkelsonLocalAlignment
081 * @see CrochemoreLandauZivUkelsonGlobalAlignment
082 */
083 public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm
084 {
085 /**
086 * The first sequence of an alignment.
087 */
088 protected CharSequence seq1;
089
090 /**
091 * The second sequence of an alignment.
092 */
093 protected CharSequence seq2;
094
095 /**
096 * The dynamic programming matrix. Each position (i, j) represents the best score
097 * between the firsts i characters of <CODE>seq1</CODE> and j characters of
098 * <CODE>seq2</CODE>.
099 */
100 protected int[][] matrix;
101
102 /**
103 * Loads sequences into {@linkplain CharSequence} instances. In case of any error,
104 * an exception is raised by the constructor of <CODE>CharSequence</CODE> (please
105 * check the specification of that class for specific requirements).
106 *
107 * @param input1 Input for first sequence
108 * @param input2 Input for second sequence
109 * @throws IOException If an I/O error occurs when reading the sequences
110 * @throws InvalidSequenceException If the sequences are not valid
111 * @see CharSequence
112 */
113 protected void loadSequencesInternal (Reader input1, Reader input2)
114 throws IOException, InvalidSequenceException
115 {
116 // load sequences into instances of CharSequence
117 this.seq1 = new CharSequence(input1);
118 this.seq2 = new CharSequence(input2);
119 }
120
121 /**
122 * Frees pointers to loaded sequences and the dynamic programming matrix so that their
123 * data can be garbage collected.
124 */
125 protected void unloadSequencesInternal ()
126 {
127 this.seq1 = null;
128 this.seq2 = null;
129 this.matrix = null;
130 }
131
132 /**
133 * Builds an optimal global alignment between the loaded sequences after computing the
134 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
135 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
136 * matrix.
137 *
138 * @return an optimal global alignment between the loaded sequences
139 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
140 * with the loaded sequences.
141 * @see #computeMatrix
142 * @see #buildOptimalAlignment
143 */
144 protected PairwiseAlignment computePairwiseAlignment ()
145 throws IncompatibleScoringSchemeException
146 {
147 // compute the matrix
148 computeMatrix ();
149
150 // build and return an optimal global alignment
151 PairwiseAlignment alignment = buildOptimalAlignment ();
152
153 // allow the matrix to be garbage collected
154 matrix = null;
155
156 return alignment;
157 }
158
159 /**
160 * Computes the dynamic programming matrix.
161 *
162 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
163 * with the loaded sequences.
164 */
165 protected void computeMatrix () throws IncompatibleScoringSchemeException
166 {
167 int r, c, rows, cols, ins, del, sub;
168
169 rows = seq1.length()+1;
170 cols = seq2.length()+1;
171
172 matrix = new int [rows][cols];
173
174 // initiate first row
175 matrix[0][0] = 0;
176 for (c = 1; c < cols; c++)
177 matrix[0][c] = matrix[0][c-1] + scoreInsertion(seq2.charAt(c));
178
179 // calculates the similarity matrix (row-wise)
180 for (r = 1; r < rows; r++)
181 {
182 // initiate first column
183 matrix[r][0] = matrix[r-1][0] + scoreDeletion(seq1.charAt(r));
184
185 for (c = 1; c < cols; c++)
186 {
187 ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
188 sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
189 del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));
190
191 // choose the greatest
192 matrix[r][c] = max (ins, sub, del);
193 }
194 }
195 }
196
197 /**
198 * Builds an optimal global alignment between the loaded sequences. Before it is
199 * executed, the dynamic programming matrix must already have been computed by
200 * the <CODE>computeMatrix</CODE> method.
201 *
202 * @return an optimal global alignment between the loaded sequences
203 * @throws IncompatibleScoringSchemeException If the scoring scheme
204 * is not compatible with the loaded sequences.
205 * @see #computeMatrix
206 */
207 protected PairwiseAlignment buildOptimalAlignment ()
208 throws IncompatibleScoringSchemeException
209 {
210 StringBuffer gapped_seq1, score_tag_line, gapped_seq2;
211 int r, c, sub, max_score;
212
213 gapped_seq1 = new StringBuffer();
214 score_tag_line = new StringBuffer();
215 gapped_seq2 = new StringBuffer();
216
217 // start at the last row, last column
218 r = matrix.length - 1;
219 c = matrix[r].length - 1;
220 max_score = matrix[r][c];
221
222 while ((r > 0) || (c > 0))
223 {
224 if (c > 0)
225 if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
226 {
227 // insertion was used
228 gapped_seq1.insert (0, GAP_CHARACTER);
229 score_tag_line.insert (0, GAP_TAG);
230 gapped_seq2.insert (0, seq2.charAt(c));
231 c = c - 1;
232
233 // skip to the next iteration
234 continue;
235 }
236
237 if ((r > 0) && (c > 0))
238 {
239 sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
240
241 if (matrix[r][c] == matrix[r-1][c-1] + sub)
242 {
243 // substitution was used
244 gapped_seq1.insert (0, seq1.charAt(r));
245 if (seq1.charAt(r) == seq2.charAt(c))
246 if (useMatchTag())
247 score_tag_line.insert (0, MATCH_TAG);
248 else
249 score_tag_line.insert (0, seq1.charAt(r));
250 else if (sub > 0)
251 score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
252 else
253 score_tag_line.insert (0, MISMATCH_TAG);
254 gapped_seq2.insert (0, seq2.charAt(c));
255 r = r - 1; c = c - 1;
256
257 // skip to the next iteration
258 continue;
259 }
260 }
261
262 // must be a deletion
263 gapped_seq1.insert (0, seq1.charAt(r));
264 score_tag_line.insert (0, GAP_TAG);
265 gapped_seq2.insert (0, GAP_CHARACTER);
266 r = r - 1;
267 }
268
269 return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
270 gapped_seq2.toString(), max_score);
271 }
272
273 /**
274 * Computes the score of the best global alignment between the two sequences using the
275 * scoring scheme previously set. This method calculates the similarity value only
276 * (doesn't build the whole matrix so the alignment cannot be recovered, however it
277 * has the advantage of requiring O(n) space only).
278 *
279 * @return score of the best global alignment between the loaded sequences
280 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
281 * with the loaded sequences.
282 */
283 protected int computeScore () throws IncompatibleScoringSchemeException
284 {
285 int[] array;
286 int r, c, rows, cols, tmp, ins, del, sub;
287
288 rows = seq1.length()+1;
289 cols = seq2.length()+1;
290
291 if (rows <= cols)
292 {
293 // goes columnwise
294 array = new int [rows];
295
296 // initiate first column
297 array[0] = 0;
298 for (r = 1; r < rows; r++)
299 array[r] = array[r-1] + scoreDeletion(seq1.charAt(r));
300
301 // calculate the similarity matrix (keep current column only)
302 for (c = 1; c < cols; c++)
303 {
304 // initiate first row (tmp hold values
305 // that will be later moved to the array)
306 tmp = array[0] + scoreInsertion(seq2.charAt(c));
307
308 for (r = 1; r < rows; r++)
309 {
310 ins = array[r] + scoreInsertion(seq2.charAt(c));
311 sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
312 del = tmp + scoreDeletion(seq1.charAt(r));
313
314 // move the temp value to the array
315 array[r-1] = tmp;
316
317 // choose the greatest
318 tmp = max (ins, sub, del);
319 }
320
321 // move the temp value to the array
322 array[rows - 1] = tmp;
323 }
324
325 return array[rows - 1];
326 }
327 else
328 {
329 // goes rowwise
330 array = new int [cols];
331
332 // initiate first row
333 array[0] = 0;
334 for (c = 1; c < cols; c++)
335 array[c] = array[c-1] + scoreInsertion(seq2.charAt(c));
336
337 // calculate the similarity matrix (keep current row only)
338 for (r = 1; r < rows; r++)
339 {
340 // initiate first column (tmp hold values
341 // that will be later moved to the array)
342 tmp = array[0] + scoreDeletion(seq1.charAt(r));
343
344 for (c = 1; c < cols; c++)
345 {
346 ins = tmp + scoreInsertion(seq2.charAt(c));
347 sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
348 del = array[c] + scoreDeletion(seq1.charAt(r));
349
350 // move the temp value to the array
351 array[c-1] = tmp;
352
353 // choose the greatest
354 tmp = max (ins, sub, del);
355 }
356
357 // move the temp value to the array
358 array[cols - 1] = tmp;
359 }
360
361 return array[cols - 1];
362 }
363 }
364 }