diff --git a/src/editDistance.cc b/src/editDistance.cc index fc75afd4..d49de20b 100644 --- a/src/editDistance.cc +++ b/src/editDistance.cc @@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public License along with this program; if not, see . */ +#include #include #include #include @@ -27,6 +28,13 @@ this program; if not, see . #include using namespace std; + +struct UniqueVecOut +{ + ColumnVector IA; + ColumnVector IC; + Cell IA_c; +}; // Function for computing the minimum of three integer values int minimum (int a, int b, int c) @@ -37,114 +45,180 @@ int minimum (int a, int b, int c) return min; } - -// Function for computing the Levenshtein distance -int LevensDist (const string& s1, const string s2) +// Function for computing the Levenshtein distance between two strings +int LevensDistStr (const string& s1, const string& s2) { - const int rows = s1.length() + 1; - const int cols = s2.length() + 1; - // Dynamically allocate distance matrix - int **dist; - dist = new int*[rows]; - for (int i = 0; i curr(cols+1, 0); + int prev; + // Prepopulate 1st column + for (int j = 0; j <= cols; j++) { - dist[i][0] = i; - } - for (int j = 0; j < cols; j++) - { - dist[0][j] = j; + curr[j] = j; } // Compute all other elements in distance matrix - for (int i = 1; i < rows; i++) + for (int i = 1; i <= rows; i++) { - for (int j = 1; j < cols; j++) + prev = curr[0]; + curr[0] = i; + for (int j = 1; j <= cols; j++) { + int temp = curr[j]; if (s1[i - 1] == s2[j - 1]) { - dist[i][j] = dist[i - 1][j - 1]; + curr[j] = prev; } else { - dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1, - dist[i][j - 1] + 1); + curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]); } + prev = temp; } } - int d = dist[rows - 1][cols - 1]; - // Free memory - for(int i = 0; i < rows; i++) - { - delete dist[i]; - } - delete dist; - return d; + return curr[cols]; } -// Function for comparing the Levenshtein distance against a minimum distance -bool minLevensDist (const string& s1, const string& s2, int const minDist) +// Function for computing the Levenshtein distance between two documents +int LevensDistDoc (const Cell& d1, const Cell& d2) { - int rows = s1.length() + 1; - int cols = s2.length() + 1; - // Dynamically allocate distance matrix - int **dist; - dist = new int*[rows]; - for (int i = 0; i curr(cols+1, 0); + int prev; + // Prepopulate 1st column + for (int j = 0; j <= cols; j++) { - dist[0][j] = j; + curr[j] = j; } - // Compute all other elements in distance matrix until minDist is exceeded - for (int i = 1; i < rows; i++) + // Compute all other elements in distance matrix + for (int i = 1; i <= rows; i++) { - for (int j = 1; j < cols; j++) + prev = curr[0]; + curr[0] = i; + for (int j = 1; j <= cols; j++) { - if (s1[i - 1] == s2[j - 1]) + int temp = curr[j]; + if (d1(i - 1).string_value() == d2(i - 1).string_value()) { - dist[i][j] = dist[i - 1][j - 1]; + curr[j] = prev; } else { - dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1, - dist[i][j - 1] + 1); - int d = dist[rows - 1][cols - 1]; - if (d > minDist) - { - // Free memory - for(int i = 0; i < rows; i++) - { - delete dist[i]; - } - delete dist; - return false; - } - } + curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]); + } + prev = temp; } } - // Free memory - for(int i = 0; i < rows; i++) + return curr[cols]; +} + +// Transform a distance triu matric to a boolean triu matrix +boolMatrix double2bool (const Matrix& D, const int& minDist) +{ + const int sz = D.rows(); + boolMatrix Bmat(sz, sz); + for (octave_idx_type i = 0; i < sz - 1; i++) + { + Bmat(i,i) = true; + for (octave_idx_type j = i + 1; j < sz; j++) + { + if (D(i,j) <= minDist) + { + Bmat(i,j) = true; + Bmat(j,i) = false; + } + else + { + Bmat(i,j) = false; + Bmat(j,i) = false; + } + } + } + Bmat(sz - 1,sz - 1) = true; + return Bmat; +} + +// Transform a distance triu matric to a distance vector +Matrix triu2Dvec (const Matrix& D) +{ + const int szA = D.rows(); + const int sz = szA * (szA - 1) / 2; + octave_idx_type idx = 0; + Matrix Dvec(sz, 1); + for (octave_idx_type i = 0; i < szA - 1; i++) + { + for (octave_idx_type j = i + 1; j < szA; j++) + { + Dvec(idx++,0) = D(i,j); + } + } + return Dvec; +} + +// Compute unique indexing IA cell of vectors +vector> IAcellvec (const boolMatrix& B) +{ + int rows = B.rows(); + int cols = B.columns(); + vector> IAcell; + for (int i = 0; i < rows; i++) + { + vector IA_cIdx; + IA_cIdx.push_back(i); + for (int j = i + 1; j < cols; j++) + { + if (B(i,j)) + { + IA_cIdx.push_back(j); + } + } + IAcell.push_back(IA_cIdx); + } + return IAcell; +} + +// Compute unique indexing IA vector +vector IAvector (const vector>& IAcell) +{ + vector IA; + vector IA_done; + for (int i = 0; i < IAcell.size(); i++) { - delete dist[i]; + for (int j = 0; j < IAcell[i].size(); j++) + { + if (binary_search(IA_done.begin(), IA_done.end(), IAcell[i][j])) + { + break; + } + else + { + if (j == 0) + { + IA.push_back(IAcell[i][j]); + } + else + { + IA_done.push_back(IAcell[i][j]); + } + } + } + sort (IA_done.begin(), IA_done.end()); } - delete dist; - return true; + return IA; } DEFUN_DLD(editDistance, args, nargout, "-*- texinfo -*-\n\ @deftypefn {statistics} {@var{d} =} editDistance (@var{str1})\n\ + @deftypefnx {statistics} {@var{d} =} editDistance (@var{doc1})\n\ + @deftypefnx {statistics} {@var{C} =} editDistance (@dots{}, @var{minDist})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{true})\n\ + @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\ + (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{false})\n\ @deftypefnx {statistics} {@var{d} =} editDistance (@var{str1}, @var{str2})\n\ @deftypefnx {statistics} {@var{d} =} editDistance (@dots{}, @var{minDist})\n\ \n\ @@ -172,6 +246,28 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ @end deftypefn") { int nargin = args.length(); + // Add default options + bool OutputAllIndices = false; + // Parse Name-Value paired arguments + if (nargin > 2 && args(nargin-2).is_string()) + { + string ParamName = "OutputAllIndices"; + if (args(nargin - 2).string_value() == ParamName) + { + const int idx = nargin - 1; + if (args(idx).islogical() && args(idx).numel() == 1) + { + const boolMatrix tmp = args(idx).bool_matrix_value(); + OutputAllIndices = tmp(0); + } + else + { + error ("editDistance: value for OutputAllIndices must be a logical scalar."); + } + nargin--; + nargin--; + } + } // Check for invalid number of input arguments if (nargin > 3) { @@ -183,11 +279,11 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ if (nargin > 1 && args(nargin-1).isnumeric()) { // Check minDist input argument - if (args(nargin-1).numel() != 1) + if (args(nargin -1 ).numel() != 1) { error ("editDistance: minDist must be a scalar value."); } - Matrix tmp = args(nargin-1).matrix_value(); + Matrix tmp = args(nargin - 1).matrix_value(); if (tmp(0,0) < 0 || floor (tmp(0,0)) != tmp(0,0)) { error ("editDistance: minDist must be a nonnegative integer."); @@ -201,7 +297,7 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ doMinDist = false; } // Check cases of string arguments - octave_value_list retval; + octave_value_list retval (nargout); if (nargin == 1) { if (args(0).iscellstr()) @@ -215,36 +311,160 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ retval(0) = double (0); return retval; } - // Compute the distance vector - int sz = szA * (szA - 1) / 2; - octave_idx_type idx = 0; - if (doMinDist) + // Compute the edit distance + Matrix D(szA, szA); + #pragma omp parallel { - boolMatrix D(sz, 1); #pragma omp parallel for for (octave_idx_type i = 0; i < szA - 1; i++) { + D(i,i) = 0; string s1 = strA(i).string_value(); for (octave_idx_type j = i + 1; j < szA; j++) { - D(idx++,0) = minLevensDist (s1, strA(j).string_value(), minDist); + D(i,j) = LevensDistStr (s1, strA(j).string_value()); + } + } + D(szA - 1,szA - 1) = 0; + } + // If minDist is given, revert functionality from 'pdist' to 'uniquetol' + // and return a vector indexing similar strings + if (doMinDist) + { + boolMatrix B = double2bool (D, minDist); + vector> IAc; + vector IAv; + IAc = IAcellvec (B); + IAv = IAvector (IAc); + // Build cellstr with unique elements + Cell C(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + C(i,0) = strA(IAv[i]).string_value(); + } + retval(0) = C; + // Build IA vector output + if (nargout > 1) + { + if (OutputAllIndices) + { + Cell IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + int idx = IAv[i]; + Matrix IAidx(IAc[idx].size(), 1); + for (octave_idx_type j = 0; j < IAc[idx].size(); j++) + { + IAidx(j,0) = IAc[idx][j] + 1; + } + IA(i,0) = IAidx; + } + retval(1) = IA; + } + else + { + Matrix IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + IA(i,0) = IAv[i] + 1; + } + retval(1) = IA; } } - retval(0) = D; } else { - Matrix D(sz, 1); + // Transform to distance vector + retval(0) = triu2Dvec (D); + } + return retval; + } + else if (args(0).iscell()) + { + // Get cellstr input argument + const Cell docA = args(0).cell_value(); + int szA = docA.numel(); + // Check that all cell elements contain cellstr arrays + for (octave_idx_type i = 0; i < szA - 1; i++) + { + Cell tmp = docA.elem(i); + if (! tmp.iscellstr()) + { + error ("editDistance: tokenizedDocument " + "must contain cellstr arrays."); + } + } + // For scalar input return distance to itself, i.e. 0 + if (szA == 1) + { + retval(0) = double (0); + return retval; + } + // Compute the edit distance + Matrix D(szA, szA); + #pragma omp parallel + { #pragma omp parallel for for (octave_idx_type i = 0; i < szA - 1; i++) { - string s1 = strA(i).string_value(); + D(i,i) = 0; + Cell d1 = docA.elem(i); for (octave_idx_type j = i + 1; j < szA; j++) { - D(idx++, 0) = LevensDist (s1, strA(j).string_value()); + D(i,j) = LevensDistDoc (d1, docA.elem(j)); + } + } + D(szA - 1,szA - 1) = 0; + } + // If minDist is given, revert functionality from 'pdist' to 'uniquetol' + // and return a vector indexing similar documents + if (doMinDist) + { + boolMatrix B = double2bool (D, minDist); + vector> IAc; + vector IAv; + IAc = IAcellvec (B); + IAv = IAvector (IAc); + // Build cellstr with unique elements + Cell C(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + C(i,0) = docA(IAv[i]).cell_value(); + } + retval(0) = C; + // Build IA vector output + if (nargout > 1) + { + if (OutputAllIndices) + { + Cell IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + int idx = IAv[i]; + Matrix IAidx(IAc[idx].size(), 1); + for (octave_idx_type j = 0; j < IAc[idx].size(); j++) + { + IAidx(j,0) = IAc[idx][j] + 1; + } + IA(i,0) = IAidx; + } + retval(1) = IA; + } + else + { + Matrix IA(IAv.size(), 1); + for (octave_idx_type i = 0; i < IAv.size(); i++) + { + IA(i,0) = IAv[i] + 1; + } + retval(1) = IA; } } - retval(0) = D; + } + else + { + // Transform to distance vector + retval(0) = triu2Dvec (D); } return retval; } @@ -267,88 +487,79 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ { error ("editDistance: cellstr input arguments size mismatch."); } + // Preallocate the distance vector + int szV = szA; + if (szA == 1) + { + szV = szB; + } + Matrix D(szV, 1); // Compute the distance vector if (szA == 1 && szB != 1) { - if (doMinDist) - { - boolMatrix D (szB, 1); - for (octave_idx_type i = 0; i < szB; i++) - { - D(i,0) = minLevensDist (strA(0).string_value(), - strB(i).string_value(), minDist); - } - retval(0) = D; - } - else + for (octave_idx_type i = 0; i < szB; i++) { - Matrix D (szB, 1); - for (octave_idx_type i = 0; i < szB; i++) - { - D(i,0) = LevensDist (strA(0).string_value(), - strB(i).string_value()); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(0).string_value(), + strB(i).string_value()); } + retval(0) = D; } else if (szA != 1 && szB == 1) { - if (doMinDist) + for (octave_idx_type i = 0; i < szA; i++) { - boolMatrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) - { - D(i,0) = minLevensDist (strA(i).string_value(), - strB(0).string_value(), minDist); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(i).string_value(), + strB(0).string_value()); } - else + retval(0) = D; + } + else + { + for (octave_idx_type i = 0; i < szA; i++) { - Matrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) - { - D(i,0) = LevensDist (strA(i).string_value(), - strB(0).string_value()); - } - retval(0) = D; + D(i,0) = LevensDistStr (strA(i).string_value(), + strB(i).string_value()); } + retval(0) = D; } - else + if (doMinDist) { - if (doMinDist) + boolMatrix Bvec(szV, 1); + for (octave_idx_type i = 0; i < szV; i++) { - boolMatrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) + if (D(i,0) <= minDist) { - D(i,0) = minLevensDist (strA(i).string_value(), - strB(i).string_value(), minDist); + Bvec(i,0) = true; } - retval(0) = D; - } - else - { - Matrix D (szA, 1); - for (octave_idx_type i = 0; i < szA; i++) + else { - D(i,0) = LevensDist (strA(i).string_value(), - strB(i).string_value()); + Bvec(i,0) = false; } - retval(0) = D; } + retval(0) = Bvec; + } + else + { + retval(0) = D; } - } else if (args(0).is_string() && args(1).is_string()) { + int d = LevensDistStr (args(0).string_value(), args(1).string_value()); if (doMinDist) { - retval(0) = minLevensDist (args(0).string_value(), - args(1).string_value(), minDist); + if (d <= minDist) + { + retval(0) = true; + } + else + { + retval(0) = false; + } } else { - retval(0) = LevensDist (args(0).string_value(), args(1).string_value()); + retval(0) = d; } } else @@ -379,6 +590,10 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! d = editDistance ("string1", "string2", -2); %!error ... %! d = editDistance ("string1", "string2", 1.25); +%!error ... +%! d = editDistance ({{"string1", "string2"}, 2}); +%!error ... +%! d = editDistance ({{"string1", "string2"}, 2}, 2); %!error ... %! d = editDistance ([1, 2, 3]); %!error ... @@ -402,9 +617,26 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! assert (d, [2; 1; 1]); %! assert (class (d), "double"); %!test -%! d = editDistance ({"AS","SD","AD"}, 1); -%! assert (d, logical ([0; 1; 1])); -%! assert (class (d), "logical"); +%! C = editDistance ({"AS","SD","AD"}, 1); +%! assert (iscellstr (C), true); +%! assert (C, {"AS";"SD"}); +%!test +%! [C, IA] = editDistance ({"AS","SD","AD"}, 1); +%! assert (class (IA), "double"); +%! assert (IA, [1;2]); +%!test +%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"}; +%! [C, IA] = editDistance (A, 2, "OutputAllIndices", false); +%! assert (class (IA), "double"); +%! assert (A(IA), C); +%!test +%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"}; +%! [C, IA] = editDistance (A, 2, "OutputAllIndices", true); +%! assert (class (IA), "cell"); +%! assert (C, {"ASS"; "FDE"; "OPA"}); +%! assert (A(IA{1}), {"ASS"; "SDS"; "EDS"}); +%! assert (A(IA{2}), {"FDE"; "EDS"}); +%! assert (A(IA{3}), {"OPA"}); %!test %! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}); %! assert (d, [0; 1; 2]); @@ -418,23 +650,23 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\ %! assert (d, [0; 2; 1]); %! assert (class (d), "double"); %!test -%! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1); -%! assert (d, logical ([1; 1; 0])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1); +%! assert (b, logical ([1; 1; 0])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ({"AS","SD","AD"}, {"AS"}, 1); -%! assert (d, logical ([1; 0; 1])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS","SD","AD"}, {"AS"}, 1); +%! assert (b, logical ([1; 0; 1])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1); -%! assert (d, logical ([1; 1; 0])); -%! assert (class (d), "logical"); +%! b = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1); +%! assert (b, logical ([1; 1; 0])); +%! assert (class (b), "logical"); %!test -%! d = editDistance ("Octave", "octave"); -%! assert (d, 1); -%! assert (class (d), "double"); +%! b = editDistance ("Octave", "octave"); +%! assert (b, 1); +%! assert (class (b), "double"); %!test -%! d = editDistance ("Octave", "octave", 1); -%! assert (d, true); -%! assert (class (d), "logical"); +%! b = editDistance ("Octave", "octave", 1); +%! assert (b, true); +%! assert (class (b), "logical"); */