diff --git a/src/editDistance.cc b/src/editDistance.cc
index fc75afd4..d49de20b 100644
--- a/src/editDistance.cc
+++ b/src/editDistance.cc
@@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public License along with
this program; if not, see .
*/
+#include
#include
#include
#include
@@ -27,6 +28,13 @@ this program; if not, see .
#include
using namespace std;
+
+struct UniqueVecOut
+{
+ ColumnVector IA;
+ ColumnVector IC;
+ Cell IA_c;
+};
// Function for computing the minimum of three integer values
int minimum (int a, int b, int c)
@@ -37,114 +45,180 @@ int minimum (int a, int b, int c)
return min;
}
-
-// Function for computing the Levenshtein distance
-int LevensDist (const string& s1, const string s2)
+// Function for computing the Levenshtein distance between two strings
+int LevensDistStr (const string& s1, const string& s2)
{
- const int rows = s1.length() + 1;
- const int cols = s2.length() + 1;
- // Dynamically allocate distance matrix
- int **dist;
- dist = new int*[rows];
- for (int i = 0; i curr(cols+1, 0);
+ int prev;
+ // Prepopulate 1st column
+ for (int j = 0; j <= cols; j++)
{
- dist[i][0] = i;
- }
- for (int j = 0; j < cols; j++)
- {
- dist[0][j] = j;
+ curr[j] = j;
}
// Compute all other elements in distance matrix
- for (int i = 1; i < rows; i++)
+ for (int i = 1; i <= rows; i++)
{
- for (int j = 1; j < cols; j++)
+ prev = curr[0];
+ curr[0] = i;
+ for (int j = 1; j <= cols; j++)
{
+ int temp = curr[j];
if (s1[i - 1] == s2[j - 1])
{
- dist[i][j] = dist[i - 1][j - 1];
+ curr[j] = prev;
}
else
{
- dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1,
- dist[i][j - 1] + 1);
+ curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]);
}
+ prev = temp;
}
}
- int d = dist[rows - 1][cols - 1];
- // Free memory
- for(int i = 0; i < rows; i++)
- {
- delete dist[i];
- }
- delete dist;
- return d;
+ return curr[cols];
}
-// Function for comparing the Levenshtein distance against a minimum distance
-bool minLevensDist (const string& s1, const string& s2, int const minDist)
+// Function for computing the Levenshtein distance between two documents
+int LevensDistDoc (const Cell& d1, const Cell& d2)
{
- int rows = s1.length() + 1;
- int cols = s2.length() + 1;
- // Dynamically allocate distance matrix
- int **dist;
- dist = new int*[rows];
- for (int i = 0; i curr(cols+1, 0);
+ int prev;
+ // Prepopulate 1st column
+ for (int j = 0; j <= cols; j++)
{
- dist[0][j] = j;
+ curr[j] = j;
}
- // Compute all other elements in distance matrix until minDist is exceeded
- for (int i = 1; i < rows; i++)
+ // Compute all other elements in distance matrix
+ for (int i = 1; i <= rows; i++)
{
- for (int j = 1; j < cols; j++)
+ prev = curr[0];
+ curr[0] = i;
+ for (int j = 1; j <= cols; j++)
{
- if (s1[i - 1] == s2[j - 1])
+ int temp = curr[j];
+ if (d1(i - 1).string_value() == d2(i - 1).string_value())
{
- dist[i][j] = dist[i - 1][j - 1];
+ curr[j] = prev;
}
else
{
- dist[i][j] = minimum (dist[i - 1][j - 1] + 1, dist[i - 1][j] + 1,
- dist[i][j - 1] + 1);
- int d = dist[rows - 1][cols - 1];
- if (d > minDist)
- {
- // Free memory
- for(int i = 0; i < rows; i++)
- {
- delete dist[i];
- }
- delete dist;
- return false;
- }
- }
+ curr[j] = 1 + minimum (prev, curr[j - 1], curr[j]);
+ }
+ prev = temp;
}
}
- // Free memory
- for(int i = 0; i < rows; i++)
+ return curr[cols];
+}
+
+// Transform a distance triu matric to a boolean triu matrix
+boolMatrix double2bool (const Matrix& D, const int& minDist)
+{
+ const int sz = D.rows();
+ boolMatrix Bmat(sz, sz);
+ for (octave_idx_type i = 0; i < sz - 1; i++)
+ {
+ Bmat(i,i) = true;
+ for (octave_idx_type j = i + 1; j < sz; j++)
+ {
+ if (D(i,j) <= minDist)
+ {
+ Bmat(i,j) = true;
+ Bmat(j,i) = false;
+ }
+ else
+ {
+ Bmat(i,j) = false;
+ Bmat(j,i) = false;
+ }
+ }
+ }
+ Bmat(sz - 1,sz - 1) = true;
+ return Bmat;
+}
+
+// Transform a distance triu matric to a distance vector
+Matrix triu2Dvec (const Matrix& D)
+{
+ const int szA = D.rows();
+ const int sz = szA * (szA - 1) / 2;
+ octave_idx_type idx = 0;
+ Matrix Dvec(sz, 1);
+ for (octave_idx_type i = 0; i < szA - 1; i++)
+ {
+ for (octave_idx_type j = i + 1; j < szA; j++)
+ {
+ Dvec(idx++,0) = D(i,j);
+ }
+ }
+ return Dvec;
+}
+
+// Compute unique indexing IA cell of vectors
+vector> IAcellvec (const boolMatrix& B)
+{
+ int rows = B.rows();
+ int cols = B.columns();
+ vector> IAcell;
+ for (int i = 0; i < rows; i++)
+ {
+ vector IA_cIdx;
+ IA_cIdx.push_back(i);
+ for (int j = i + 1; j < cols; j++)
+ {
+ if (B(i,j))
+ {
+ IA_cIdx.push_back(j);
+ }
+ }
+ IAcell.push_back(IA_cIdx);
+ }
+ return IAcell;
+}
+
+// Compute unique indexing IA vector
+vector IAvector (const vector>& IAcell)
+{
+ vector IA;
+ vector IA_done;
+ for (int i = 0; i < IAcell.size(); i++)
{
- delete dist[i];
+ for (int j = 0; j < IAcell[i].size(); j++)
+ {
+ if (binary_search(IA_done.begin(), IA_done.end(), IAcell[i][j]))
+ {
+ break;
+ }
+ else
+ {
+ if (j == 0)
+ {
+ IA.push_back(IAcell[i][j]);
+ }
+ else
+ {
+ IA_done.push_back(IAcell[i][j]);
+ }
+ }
+ }
+ sort (IA_done.begin(), IA_done.end());
}
- delete dist;
- return true;
+ return IA;
}
DEFUN_DLD(editDistance, args, nargout,
"-*- texinfo -*-\n\
@deftypefn {statistics} {@var{d} =} editDistance (@var{str1})\n\
+ @deftypefnx {statistics} {@var{d} =} editDistance (@var{doc1})\n\
+ @deftypefnx {statistics} {@var{C} =} editDistance (@dots{}, @var{minDist})\n\
+ @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\
+ (@dots{}, @var{minDist})\n\
+ @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\
+ (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{true})\n\
+ @deftypefnx {statistics} {[@var{C}, @var{IA}, @var{IC}] =} editDistance @\
+ (@dots{}, @var{minDist}, @qcode{\"OutputAllIndices\"}, @qcode{false})\n\
@deftypefnx {statistics} {@var{d} =} editDistance (@var{str1}, @var{str2})\n\
@deftypefnx {statistics} {@var{d} =} editDistance (@dots{}, @var{minDist})\n\
\n\
@@ -172,6 +246,28 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
@end deftypefn")
{
int nargin = args.length();
+ // Add default options
+ bool OutputAllIndices = false;
+ // Parse Name-Value paired arguments
+ if (nargin > 2 && args(nargin-2).is_string())
+ {
+ string ParamName = "OutputAllIndices";
+ if (args(nargin - 2).string_value() == ParamName)
+ {
+ const int idx = nargin - 1;
+ if (args(idx).islogical() && args(idx).numel() == 1)
+ {
+ const boolMatrix tmp = args(idx).bool_matrix_value();
+ OutputAllIndices = tmp(0);
+ }
+ else
+ {
+ error ("editDistance: value for OutputAllIndices must be a logical scalar.");
+ }
+ nargin--;
+ nargin--;
+ }
+ }
// Check for invalid number of input arguments
if (nargin > 3)
{
@@ -183,11 +279,11 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
if (nargin > 1 && args(nargin-1).isnumeric())
{
// Check minDist input argument
- if (args(nargin-1).numel() != 1)
+ if (args(nargin -1 ).numel() != 1)
{
error ("editDistance: minDist must be a scalar value.");
}
- Matrix tmp = args(nargin-1).matrix_value();
+ Matrix tmp = args(nargin - 1).matrix_value();
if (tmp(0,0) < 0 || floor (tmp(0,0)) != tmp(0,0))
{
error ("editDistance: minDist must be a nonnegative integer.");
@@ -201,7 +297,7 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
doMinDist = false;
}
// Check cases of string arguments
- octave_value_list retval;
+ octave_value_list retval (nargout);
if (nargin == 1)
{
if (args(0).iscellstr())
@@ -215,36 +311,160 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
retval(0) = double (0);
return retval;
}
- // Compute the distance vector
- int sz = szA * (szA - 1) / 2;
- octave_idx_type idx = 0;
- if (doMinDist)
+ // Compute the edit distance
+ Matrix D(szA, szA);
+ #pragma omp parallel
{
- boolMatrix D(sz, 1);
#pragma omp parallel for
for (octave_idx_type i = 0; i < szA - 1; i++)
{
+ D(i,i) = 0;
string s1 = strA(i).string_value();
for (octave_idx_type j = i + 1; j < szA; j++)
{
- D(idx++,0) = minLevensDist (s1, strA(j).string_value(), minDist);
+ D(i,j) = LevensDistStr (s1, strA(j).string_value());
+ }
+ }
+ D(szA - 1,szA - 1) = 0;
+ }
+ // If minDist is given, revert functionality from 'pdist' to 'uniquetol'
+ // and return a vector indexing similar strings
+ if (doMinDist)
+ {
+ boolMatrix B = double2bool (D, minDist);
+ vector> IAc;
+ vector IAv;
+ IAc = IAcellvec (B);
+ IAv = IAvector (IAc);
+ // Build cellstr with unique elements
+ Cell C(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ C(i,0) = strA(IAv[i]).string_value();
+ }
+ retval(0) = C;
+ // Build IA vector output
+ if (nargout > 1)
+ {
+ if (OutputAllIndices)
+ {
+ Cell IA(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ int idx = IAv[i];
+ Matrix IAidx(IAc[idx].size(), 1);
+ for (octave_idx_type j = 0; j < IAc[idx].size(); j++)
+ {
+ IAidx(j,0) = IAc[idx][j] + 1;
+ }
+ IA(i,0) = IAidx;
+ }
+ retval(1) = IA;
+ }
+ else
+ {
+ Matrix IA(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ IA(i,0) = IAv[i] + 1;
+ }
+ retval(1) = IA;
}
}
- retval(0) = D;
}
else
{
- Matrix D(sz, 1);
+ // Transform to distance vector
+ retval(0) = triu2Dvec (D);
+ }
+ return retval;
+ }
+ else if (args(0).iscell())
+ {
+ // Get cellstr input argument
+ const Cell docA = args(0).cell_value();
+ int szA = docA.numel();
+ // Check that all cell elements contain cellstr arrays
+ for (octave_idx_type i = 0; i < szA - 1; i++)
+ {
+ Cell tmp = docA.elem(i);
+ if (! tmp.iscellstr())
+ {
+ error ("editDistance: tokenizedDocument "
+ "must contain cellstr arrays.");
+ }
+ }
+ // For scalar input return distance to itself, i.e. 0
+ if (szA == 1)
+ {
+ retval(0) = double (0);
+ return retval;
+ }
+ // Compute the edit distance
+ Matrix D(szA, szA);
+ #pragma omp parallel
+ {
#pragma omp parallel for
for (octave_idx_type i = 0; i < szA - 1; i++)
{
- string s1 = strA(i).string_value();
+ D(i,i) = 0;
+ Cell d1 = docA.elem(i);
for (octave_idx_type j = i + 1; j < szA; j++)
{
- D(idx++, 0) = LevensDist (s1, strA(j).string_value());
+ D(i,j) = LevensDistDoc (d1, docA.elem(j));
+ }
+ }
+ D(szA - 1,szA - 1) = 0;
+ }
+ // If minDist is given, revert functionality from 'pdist' to 'uniquetol'
+ // and return a vector indexing similar documents
+ if (doMinDist)
+ {
+ boolMatrix B = double2bool (D, minDist);
+ vector> IAc;
+ vector IAv;
+ IAc = IAcellvec (B);
+ IAv = IAvector (IAc);
+ // Build cellstr with unique elements
+ Cell C(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ C(i,0) = docA(IAv[i]).cell_value();
+ }
+ retval(0) = C;
+ // Build IA vector output
+ if (nargout > 1)
+ {
+ if (OutputAllIndices)
+ {
+ Cell IA(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ int idx = IAv[i];
+ Matrix IAidx(IAc[idx].size(), 1);
+ for (octave_idx_type j = 0; j < IAc[idx].size(); j++)
+ {
+ IAidx(j,0) = IAc[idx][j] + 1;
+ }
+ IA(i,0) = IAidx;
+ }
+ retval(1) = IA;
+ }
+ else
+ {
+ Matrix IA(IAv.size(), 1);
+ for (octave_idx_type i = 0; i < IAv.size(); i++)
+ {
+ IA(i,0) = IAv[i] + 1;
+ }
+ retval(1) = IA;
}
}
- retval(0) = D;
+ }
+ else
+ {
+ // Transform to distance vector
+ retval(0) = triu2Dvec (D);
}
return retval;
}
@@ -267,88 +487,79 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
{
error ("editDistance: cellstr input arguments size mismatch.");
}
+ // Preallocate the distance vector
+ int szV = szA;
+ if (szA == 1)
+ {
+ szV = szB;
+ }
+ Matrix D(szV, 1);
// Compute the distance vector
if (szA == 1 && szB != 1)
{
- if (doMinDist)
- {
- boolMatrix D (szB, 1);
- for (octave_idx_type i = 0; i < szB; i++)
- {
- D(i,0) = minLevensDist (strA(0).string_value(),
- strB(i).string_value(), minDist);
- }
- retval(0) = D;
- }
- else
+ for (octave_idx_type i = 0; i < szB; i++)
{
- Matrix D (szB, 1);
- for (octave_idx_type i = 0; i < szB; i++)
- {
- D(i,0) = LevensDist (strA(0).string_value(),
- strB(i).string_value());
- }
- retval(0) = D;
+ D(i,0) = LevensDistStr (strA(0).string_value(),
+ strB(i).string_value());
}
+ retval(0) = D;
}
else if (szA != 1 && szB == 1)
{
- if (doMinDist)
+ for (octave_idx_type i = 0; i < szA; i++)
{
- boolMatrix D (szA, 1);
- for (octave_idx_type i = 0; i < szA; i++)
- {
- D(i,0) = minLevensDist (strA(i).string_value(),
- strB(0).string_value(), minDist);
- }
- retval(0) = D;
+ D(i,0) = LevensDistStr (strA(i).string_value(),
+ strB(0).string_value());
}
- else
+ retval(0) = D;
+ }
+ else
+ {
+ for (octave_idx_type i = 0; i < szA; i++)
{
- Matrix D (szA, 1);
- for (octave_idx_type i = 0; i < szA; i++)
- {
- D(i,0) = LevensDist (strA(i).string_value(),
- strB(0).string_value());
- }
- retval(0) = D;
+ D(i,0) = LevensDistStr (strA(i).string_value(),
+ strB(i).string_value());
}
+ retval(0) = D;
}
- else
+ if (doMinDist)
{
- if (doMinDist)
+ boolMatrix Bvec(szV, 1);
+ for (octave_idx_type i = 0; i < szV; i++)
{
- boolMatrix D (szA, 1);
- for (octave_idx_type i = 0; i < szA; i++)
+ if (D(i,0) <= minDist)
{
- D(i,0) = minLevensDist (strA(i).string_value(),
- strB(i).string_value(), minDist);
+ Bvec(i,0) = true;
}
- retval(0) = D;
- }
- else
- {
- Matrix D (szA, 1);
- for (octave_idx_type i = 0; i < szA; i++)
+ else
{
- D(i,0) = LevensDist (strA(i).string_value(),
- strB(i).string_value());
+ Bvec(i,0) = false;
}
- retval(0) = D;
}
+ retval(0) = Bvec;
+ }
+ else
+ {
+ retval(0) = D;
}
-
}
else if (args(0).is_string() && args(1).is_string())
{
+ int d = LevensDistStr (args(0).string_value(), args(1).string_value());
if (doMinDist)
{
- retval(0) = minLevensDist (args(0).string_value(),
- args(1).string_value(), minDist);
+ if (d <= minDist)
+ {
+ retval(0) = true;
+ }
+ else
+ {
+ retval(0) = false;
+ }
}
else
{
- retval(0) = LevensDist (args(0).string_value(), args(1).string_value());
+ retval(0) = d;
}
}
else
@@ -379,6 +590,10 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
%! d = editDistance ("string1", "string2", -2);
%!error ...
%! d = editDistance ("string1", "string2", 1.25);
+%!error ...
+%! d = editDistance ({{"string1", "string2"}, 2});
+%!error ...
+%! d = editDistance ({{"string1", "string2"}, 2}, 2);
%!error ...
%! d = editDistance ([1, 2, 3]);
%!error ...
@@ -402,9 +617,26 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
%! assert (d, [2; 1; 1]);
%! assert (class (d), "double");
%!test
-%! d = editDistance ({"AS","SD","AD"}, 1);
-%! assert (d, logical ([0; 1; 1]));
-%! assert (class (d), "logical");
+%! C = editDistance ({"AS","SD","AD"}, 1);
+%! assert (iscellstr (C), true);
+%! assert (C, {"AS";"SD"});
+%!test
+%! [C, IA] = editDistance ({"AS","SD","AD"}, 1);
+%! assert (class (IA), "double");
+%! assert (IA, [1;2]);
+%!test
+%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"};
+%! [C, IA] = editDistance (A, 2, "OutputAllIndices", false);
+%! assert (class (IA), "double");
+%! assert (A(IA), C);
+%!test
+%! A = {"ASS"; "SDS"; "FDE"; "EDS"; "OPA"};
+%! [C, IA] = editDistance (A, 2, "OutputAllIndices", true);
+%! assert (class (IA), "cell");
+%! assert (C, {"ASS"; "FDE"; "OPA"});
+%! assert (A(IA{1}), {"ASS"; "SDS"; "EDS"});
+%! assert (A(IA{2}), {"FDE"; "EDS"});
+%! assert (A(IA{3}), {"OPA"});
%!test
%! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"});
%! assert (d, [0; 1; 2]);
@@ -418,23 +650,23 @@ identifying as @qcode{true} the pairs that DO NOT exceed @var{minDist}. \n\n\
%! assert (d, [0; 2; 1]);
%! assert (class (d), "double");
%!test
-%! d = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1);
-%! assert (d, logical ([1; 1; 0]));
-%! assert (class (d), "logical");
+%! b = editDistance ({"AS","SD","AD"}, {"AS", "AD", "SE"}, 1);
+%! assert (b, logical ([1; 1; 0]));
+%! assert (class (b), "logical");
%!test
-%! d = editDistance ({"AS","SD","AD"}, {"AS"}, 1);
-%! assert (d, logical ([1; 0; 1]));
-%! assert (class (d), "logical");
+%! b = editDistance ({"AS","SD","AD"}, {"AS"}, 1);
+%! assert (b, logical ([1; 0; 1]));
+%! assert (class (b), "logical");
%!test
-%! d = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1);
-%! assert (d, logical ([1; 1; 0]));
-%! assert (class (d), "logical");
+%! b = editDistance ({"AS"}, {"AS", "AD", "SE"}, 1);
+%! assert (b, logical ([1; 1; 0]));
+%! assert (class (b), "logical");
%!test
-%! d = editDistance ("Octave", "octave");
-%! assert (d, 1);
-%! assert (class (d), "double");
+%! b = editDistance ("Octave", "octave");
+%! assert (b, 1);
+%! assert (class (b), "double");
%!test
-%! d = editDistance ("Octave", "octave", 1);
-%! assert (d, true);
-%! assert (class (d), "logical");
+%! b = editDistance ("Octave", "octave", 1);
+%! assert (b, true);
+%! assert (class (b), "logical");
*/