001 /*
002 * SmithWaterman.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.IOException;
036
037 /**
038 * This class implement the classic local alignment algorithm (with linear gap penalty
039 * function) due to T.F.Smith and M.S.Waterman (1981).
040 *
041 * <P>This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for
042 * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given
043 * two sequences A and B of sizes n and m, respectively. However, unlike in the global
044 * alignment case, every position M[i,j] in the matrix contains the similarity score of
045 * <B>suffixes</B> of A[1..i] and B[1..j].</P>
046 *
047 * <P>Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method
048 * computes each position M[i,j] with the following recurrence:</P>
049 *
050 * <CODE><BLOCKQUOTE><PRE>
051 * M[0,0] = <B>M[0,j]</B> = <B>M[i,0]</B> = 0
052 * M[i,j] = max { M[i,j-1] + scoreInsertion (B[j]),
053 * M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
054 * M[i-1,j] + scoreDeletion(A[i]) }
055 * </PRE></BLOCKQUOTE></CODE>
056 *
057 * <P>Note that, here, all cells in the first row and column are set to zero. The best
058 * local alignment score is the highest value found anywhere in the matrix.</P>
059 *
060 * <P>Just like in global alignment case, this algorithm has quadratic space complexity
061 * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of
062 * computing each cell is constant, it also has quadratic time complexity.</P>
063 *
064 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
065 * back in the matrix from the position of the highest score until a cell of value zero is
066 * reached. This step is performed by the {@link #buildOptimalAlignment
067 * buildOptimalAlignment} method, and its time complexity is linear on the size of the
068 * alignment.
069 *
070 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
071 * reduce the space requirement to O(n) by keeping just the last row or column in memory.
072 * This is precisely what is done by the {@link #computeScore computeScore} method. Note
073 * that it still requires O(n<SUP>2</SUP>) time.</P>
074 *
075 * <P>For a more efficient approach to the local alignment problem, see the
076 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the
077 * {@linkplain NeedlemanWunsch} algorithm.</P>
078 *
079 * @author Sergio A. de Carvalho Jr.
080 * @see NeedlemanWunsch
081 * @see CrochemoreLandauZivUkelson
082 * @see CrochemoreLandauZivUkelsonLocalAlignment
083 * @see CrochemoreLandauZivUkelsonGlobalAlignment
084 */
085 public class SmithWaterman extends PairwiseAlignmentAlgorithm
086 {
087 /**
088 * The first sequence of an alignment.
089 */
090 protected CharSequence seq1;
091
092 /**
093 * The second sequence of an alignment.
094 */
095 protected CharSequence seq2;
096
097 /**
098 * The dynamic programming matrix. Each position (i, j) represents the best score
099 * between a suffic of the firsts i characters of <CODE>seq1</CODE> and a suffix of
100 * the first j characters of <CODE>seq2</CODE>.
101 */
102 protected int[][] matrix;
103
104 /**
105 * Indicate the row of where an optimal local alignment can be found in the matrix..
106 */
107 protected int max_row;
108
109 /**
110 * Indicate the column of where an optimal local alignment can be found in the matrix.
111 */
112 protected int max_col;
113
114 /**
115 * Loads sequences into {@linkplain CharSequence} instances. In case of any error, an
116 * exception is raised by the constructor of <CODE>CharSequence</CODE> (please check
117 * the specification of that class for specific requirements).
118 *
119 * @param input1 Input for first sequence
120 * @param input2 Input for second sequence
121 * @throws IOException If an I/O error occurs when reading the sequences
122 * @throws InvalidSequenceException If the sequences are not valid
123 * @see CharSequence
124 */
125 protected void loadSequencesInternal (Reader input1, Reader input2)
126 throws IOException, InvalidSequenceException
127 {
128 // load sequences into instances of CharSequence
129 this.seq1 = new CharSequence(input1);
130 this.seq2 = new CharSequence(input2);
131 }
132
133 /**
134 * Frees pointers to loaded sequences and the dynamic programming matrix so that their
135 * data can be garbage collected.
136 */
137 protected void unloadSequencesInternal ()
138 {
139 this.seq1 = null;
140 this.seq2 = null;
141 this.matrix = null;
142 }
143
144 /**
145 * Builds an optimal local alignment between the loaded sequences after computing the
146 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
147 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
148 * matrix.
149 *
150 * @return an optimal pairwise alignment between the loaded sequences
151 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
152 * with the loaded sequences.
153 * @see #computeMatrix
154 * @see #buildOptimalAlignment
155 */
156 protected PairwiseAlignment computePairwiseAlignment ()
157 throws IncompatibleScoringSchemeException
158 {
159 // compute the matrix
160 computeMatrix ();
161
162 // build and return an optimal local alignment
163 PairwiseAlignment alignment = buildOptimalAlignment ();
164
165 // allow the matrix to be garbage collected
166 matrix = null;
167
168 return alignment;
169 }
170
171 /**
172 * Computes the dynamic programming matrix.
173 *
174 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
175 * with the loaded sequences.
176 */
177 protected void computeMatrix () throws IncompatibleScoringSchemeException
178 {
179 int r, c, rows, cols, ins, sub, del, max_score;
180
181 rows = seq1.length()+1;
182 cols = seq2.length()+1;
183
184 matrix = new int [rows][cols];
185
186 // initiate first row
187 for (c = 0; c < cols; c++)
188 matrix[0][c] = 0;
189
190 // keep track of the maximum score
191 this.max_row = this.max_col = max_score = 0;
192
193 // calculates the similarity matrix (row-wise)
194 for (r = 1; r < rows; r++)
195 {
196 // initiate first column
197 matrix[r][0] = 0;
198
199 for (c = 1; c < cols; c++)
200 {
201 ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
202 sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
203 del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));
204
205 // choose the greatest
206 matrix[r][c] = max (ins, sub, del, 0);
207
208 if (matrix[r][c] > max_score)
209 {
210 // keep track of the maximum score
211 max_score = matrix[r][c];
212 this.max_row = r; this.max_col = c;
213 }
214 }
215 }
216 }
217
218 /**
219 * Builds an optimal local alignment between the loaded sequences. Before it is
220 * executed, the dynamic programming matrix must already have been computed by
221 * the <CODE>computeMatrix</CODE> method.
222 *
223 * @return an optimal local alignment between the loaded sequences
224 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
225 * with the loaded sequences.
226 * @see #computeMatrix
227 */
228 protected PairwiseAlignment buildOptimalAlignment () throws
229 IncompatibleScoringSchemeException
230 {
231 StringBuffer gapped_seq1, score_tag_line, gapped_seq2;
232 int r, c, max_score, sub;
233
234 // start at the cell with maximum score
235 r = this.max_row;
236 c = this.max_col;
237
238 max_score = matrix[r][c];
239
240 gapped_seq1 = new StringBuffer();
241 score_tag_line = new StringBuffer();
242 gapped_seq2 = new StringBuffer();
243
244 while ((r > 0 || c > 0) && (matrix[r][c] > 0))
245 {
246 if (c > 0)
247 if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
248 {
249 // insertion
250 gapped_seq1.insert (0, GAP_CHARACTER);
251 score_tag_line.insert (0, GAP_TAG);
252 gapped_seq2.insert (0, seq2.charAt(c));
253
254 c = c - 1;
255
256 // skip to the next iteration
257 continue;
258 }
259
260 if ((r > 0) && (c > 0))
261 {
262 sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
263
264 if (matrix[r][c] == matrix[r-1][c-1] + sub)
265 {
266 // substitution
267 gapped_seq1.insert (0, seq1.charAt(r));
268 if (seq1.charAt(r) == seq2.charAt(c))
269 if (useMatchTag())
270 score_tag_line.insert (0, MATCH_TAG);
271 else
272 score_tag_line.insert (0, seq1.charAt(r));
273 else if (sub > 0)
274 score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
275 else
276 score_tag_line.insert (0, MISMATCH_TAG);
277 gapped_seq2.insert (0, seq2.charAt(c));
278
279 r = r - 1; c = c - 1;
280
281 // skip to the next iteration
282 continue;
283 }
284 }
285
286 // must be a deletion
287 gapped_seq1.insert (0, seq1.charAt(r));
288 score_tag_line.insert (0, GAP_TAG);
289 gapped_seq2.insert (0,GAP_CHARACTER);
290
291 r = r - 1;
292 }
293
294 return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
295 gapped_seq2.toString(), max_score);
296 }
297
298 /**
299 * Computes the score of the best local alignment between the two sequences using the
300 * scoring scheme previously set. This method calculates the similarity value only
301 * (doesn't build the whole matrix so the alignment cannot be recovered, however it
302 * has the advantage of requiring O(n) space only).
303 *
304 * @return the score of the best local alignment between the loaded sequences
305 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
306 * with the loaded sequences.
307 */
308 protected int computeScore () throws IncompatibleScoringSchemeException
309 {
310 int[] array;
311 int rows = seq1.length()+1, cols = seq2.length()+1;
312 int r, c, tmp, ins, del, sub, max_score;
313
314 // keep track of the maximum score
315 max_score = 0;
316
317 if (rows <= cols)
318 {
319 // goes columnwise
320 array = new int [rows];
321
322 // initiate first column
323 for (r = 0; r < rows; r++)
324 array[r] = 0;
325
326 // calculate the similarity matrix (keep current column only)
327 for (c = 1; c < cols; c++)
328 {
329 // set first position to zero (tmp hold values
330 // that will be later moved to the array)
331 tmp = 0;
332
333 for (r = 1; r < rows; r++)
334 {
335 ins = array[r] + scoreInsertion(seq2.charAt(c));
336 sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
337 del = tmp + scoreDeletion(seq1.charAt(r));
338
339 // move the temp value to the array
340 array[r-1] = tmp;
341
342 // choose the greatest (or zero if all negative)
343 tmp = max (ins, sub, del, 0);
344
345 // keep track of the maximum score
346 if (tmp > max_score) max_score = tmp;
347 }
348
349 // move the temp value to the array
350 array[rows - 1] = tmp;
351 }
352 }
353 else
354 {
355 // goes rowwise
356 array = new int [cols];
357
358 // initiate first row
359 for (c = 0; c < cols; c++)
360 array[c] = 0;
361
362 // calculate the similarity matrix (keep current row only)
363 for (r = 1; r < rows; r++)
364 {
365 // set first position to zero (tmp hold values
366 // that will be later moved to the array)
367 tmp = 0;
368
369 for (c = 1; c < cols; c++)
370 {
371 ins = tmp + scoreInsertion(seq2.charAt(c));
372 sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
373 del = array[c] + scoreDeletion(seq1.charAt(r));
374
375 // move the temp value to the array
376 array[c-1] = tmp;
377
378 // choose the greatest (or zero if all negative)
379 tmp = max (ins, sub, del, 0);
380
381 // keep track of the maximum score
382 if (tmp > max_score) max_score = tmp;
383 }
384
385 // move the temp value to the array
386 array[cols - 1] = tmp;
387 }
388 }
389
390 return max_score;
391 }
392 }