/** This class highlights text differences between two plain strings by generating html fragment to show changes using longest common subsequence algorithm
version 1.0, 24-11-2022, first release
version 1.1, 17-11-2023, Improved with Hirschberg algorithm to use linear amount of memory
*/
public class StringDiff2 {
private static final String INSERT_COLOR = "#00ff66";
private static final String DELETE_COLOR = "#ff9933";
private static final int lcs_threshold = 1;//minimum threshold for longest common subsequence
public static void main(String[] args) {
String text1 = "Do not change this section. Please check any misqelling! Note that this section is obsolete.";
String text2 = "New section added. Do not change this section. Please check any mispelling!";
String result = markTextDiff2(text1, text2, INSERT_COLOR, DELETE_COLOR);
System.out.println(result);
}
//highlights with htlm tags the changes from text1 to text2 using longest common subsequence algorithm
public static String markTextDiff2(String text1, String text2, String insertColor, String deleteColor) {
String lcs = longestCommonSubsequence(text1, text2);
StringBuilder stringBuilder = new StringBuilder();
int cur1 = 0, cur2 = 0;//cursors
for (int k = 0; k < lcs.length(); k++) {
char common = lcs.charAt(k);
int idx1 = text1.indexOf(common, cur1);
int idx2 = text2.indexOf(common, cur2);
if (idx1 > cur1) {
stringBuilder.append("").append(text1.substring(cur1, idx1)).append("");
}
if (idx2 > cur2) {
stringBuilder.append("").append(text2.substring(cur2, idx2)).append("");
}
stringBuilder.append(common);
cur1 = idx1 + 1;
cur2 = idx2 + 1;
}
if (cur1 < text1.length()) {
stringBuilder.append("").append(text1.substring(cur1)).append("");
}
if (cur2 < text2.length()) {
stringBuilder.append("").append(text2.substring(cur2)).append("");
}
return stringBuilder.toString();
}
/* This algorithm uses a quadratic amount of memory
public static String longestCommonSubsequence(String text1, String text2) {
//credit to: https://rosettacode.org/wiki/Longest_common_subsequence#Java
int[][] lengths = new int[text1.length() + 1][text2.length() + 1];
// row 0 and column 0 are initialized to 0 already
for (int i = 0; i < text1.length(); i++)
for (int j = 0; j < text2.length(); j++)
if (text1.charAt(i) == text2.charAt(j))
lengths[i+1][j+1] = lengths[i][j] + 1;
else
lengths[i+1][j+1] = Math.max(lengths[i+1][j], lengths[i][j+1]);
// get the substring out from the matrix
StringBuffer sb = new StringBuffer();
for (int x = text1.length(), y = text2.length(); x != 0 && y != 0; ) {
if (lengths[x][y] == lengths[x-1][y])
x--;
else if (lengths[x][y] == lengths[x][y-1])
y--;
else {
x--;
y--;
assert text1.charAt(x) == text2.charAt(y);
sb.append(text1.charAt(x));
}
}
return sb.reverse().toString();
}
*/
//Hirschberg algorithm, code is imported from package org.apache.commons.text.similarity, licensed to the Apache Software Foundation (ASF)
//The Hirschberg algorithm uses a linear amount of memory
//reference: https://commons.apache.org/proper/commons-text/index.html
private static String longestCommonSubsequence(final String left, final String right) {
final int m = left.length();
final int n = right.length();
final StringBuilder out = new StringBuilder();
if (m == 1) { // Handle trivial cases, as per the paper
if (right.contains(left)) //left contains one and only one character
out.append(left);
} else if (n > 0 && m > 1) {
final int mid = m / 2; // Find the middle point
final String leftFirstPart = left.substring(0, mid);
final String leftSecondPart = left.substring(mid, m);
// Step 3 of the algorithm: two calls to Algorithm B
final int[] l1 = algorithmB(leftFirstPart, right);
final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right));
// Find k, as per the Step 4 of the algorithm
int k = 0;
int t = 0;
for (int j = 0; j <= n; j++) {
final int s = l1[j] + l2[n - j];
if (t < s) {
t = s;
k = j;
}
}
// Step 5: solve simpler problems, recursively
out.append(longestCommonSubsequence(leftFirstPart, right.substring(0, k)));
out.append(longestCommonSubsequence(leftSecondPart, right.substring(k, n)));
}
return out.toString();
}
private static String reverse(final String s) {
return new StringBuilder(s).reverse().toString();
}
private static int[] algorithmB(final String left, final String right) {
final int m = left.length();
final int n = right.length();
// Creating an array for storing two rows of DP table
final int[][] dpRows = new int[2][1 + n];
for (int i = 1; i <= m; i++) {
// K(0, j) <- K(1, j) [j = 0...n], as per the paper:
// Since we have references in Java, we can swap references instead of literal copying.
// We could also use a "binary index" using modulus operator, but directly swapping the
// two rows helps readability and keeps the code consistent with the algorithm description
// in the paper.
final int[] temp = dpRows[0];
dpRows[0] = dpRows[1];
dpRows[1] = temp;
for (int j = 1; j <= n; j++) {
if (left.charAt(i - 1) == right.charAt(j - 1)) {
dpRows[1][j] = dpRows[0][j - 1] + 1;
} else {
dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]);
}
}
}
// LL(j) <- K(1, j) [j=0...n], as per the paper:
// We don't need literal copying of the array, we can just return the reference
return dpRows[1];
}
}