{

  ApproxStrUtils, v0.1a

  The contents of this file are subject to the Mozilla Public License
  Version 1.1 (the "License"); you may not use this file except in
  compliance with the License. You may obtain a copy of the License at
  http://www.mozilla.org/MPL/

  Software distributed under the License is distributed on an "AS IS"
  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
  License for the specific language governing rights and limitations
  under the License.

  The Original Code is ApproxStrUtils.

  The Initial Developer of the Original Code is Daniel Gaussmann,
  mail@gausi.de. Portions created by the Initial Developer are
  Copyright (C) 2008 the Initial Developer. All Rights Reserved.

  Contributor(s): (none yet)

  Alternatively, the contents of this file may be used under the terms
  of the GNU Lesser General Public License Version 2.1 or later
  (the  "LGPL"), in which case the provisions of LGPL are applicable
  instead of those above. If you wish to allow use of your version of
  this file only under the terms of the LGPL and not to allow others to use
  your version of this file under the MPL, indicate your decision by
  deleting the provisions above and replace them with the notice and
  other provisions required by the LGPL. If you do not delete
  the provisions above, a recipient may use your version of this file
  under either the MPL or the LGPL License.


  Changes in Version 0.1a
  -----------------------
  * replaced String with AnsiString
  * replaced Char with AnsiChar

}


unit ApproxStrUtils;

interface

uses SysUtils; // needed for AnsiUppercase

{ ApproxDistance computes the Edit-Distance (or Levenshtein-Distance) for the
  two given strings. This is the minmum count of operations needed to change
  one string into the other. Valid operations are:
    - Change a character
    - Delete a character
    - Insert a character. }

function ApproxDistance(const AText, AOther: AnsiString): Integer;

{ ApproxResemblesText returns true, if the Edit-Distance for the two given
  strings is at most MaxError. }

function ApproxResemblesText(const AText, AOther: AnsiString; MaxError: Integer = 1): Boolean;

{ ApproxContainsStr returns true if the subtext is found in the text with at most
  maxError errors according to the Edit-Distance. ApproxContainsStr is case-sensitive. }

function ApproxContainsStr(const AText, ASubText: AnsiString; MaxError: Integer = 1): Boolean;

{ ApproxContainsText returns true if the subtext is found in the text with at most
  maxError errors according to the Edit-Distance. ApproxContainsText is NOT case-sensitive. }

function ApproxContainsText(const AText, ASubText: AnsiString; MaxError: Integer = 1): Boolean;

{ ApproxPosEx returns the startposition of an appearance of the subtext in the text starting
  at position Offset with at most maxError errors according to the Edit-Distance.
  The output-paramter Count gives the number of characters in this appearance
  (e.g. for use in the copy-function). ApproxPosEx is NOT case-sensitive.

  ApproxPosEx tries to find a "good" appearance, i.e.:
  If there is an appearance with 0 errors, then around this appearance there are obviously
  two more with 1 error (deleting the character before/after the actual appearance).
  Although the first one-error-appearance will be found before the zero-error, ApproxPosEx
  tries to find a better appearance at the next position. If the next one is better, the
  search goes on for an even better one. If not, the previously one is returned. }

function ApproxPosEx(const AText, ASubText: AnsiString; Offset: Integer; Out Count: Integer; MaxError: Integer = 1): Integer;

{ ApproxPos is the same as ApproxPosEx with Offset = 1 }

function ApproxPos(const AText, ASubText: AnsiString; Out Count: Integer; MaxError: Integer = 1): Integer;

{ ApproxBestAppearance returns the startposition of the first Appearance with minimal
  errors according to the Edit-Distance.
  The output-parameters gives the number of characters in this appearance
  (e.g. for use in the copy-function) and the number of errors occured. }

function ApproxBestAppearance(const AText, ASubText: AnsiString; out Count, BestError: Integer): Integer;



implementation

type TBC_IntArray = Array[AnsiChar] of Integer;


// -------------------------------------------
// Some little functions needed in this Unit
// -------------------------------------------

{ Get the Minimum of 2 Integers}

function min(a,b: Integer): Integer;
begin
  if a <= b then
    result := a
  else
    result := b;
end;

{ Get the Minimum of 3 Integers}

function min3(a,b,c: Integer): Integer;
begin
  result := a;
  if b < result then result := b;
  if c < result then result := c;
end;


{ Get the maximum of 2 Integers }

function max(a,b: Integer): Integer;
begin
  if a >= b then
    result := a
  else
    result := b;
end;


{ ApproxStrUtils }

{ Compute the Edit-Distance by dynamic-programming with some pretty tricks }

function ApproxDistance(const AText, AOther: AnsiString): Integer;
var i, im, n, m: Integer;
    C: Array of Integer;
    pC, nC: Integer;
begin
  n := length(AText);
  m := length(AOther);

  if m = 0 then
    result := n
  else begin
      setlength(C, m+1);
      for i := 0 to m do C[i] := i;

      for i := 1 to n do
      begin
          pC := i-1;
          nC := i;
          for im  := 1 to m do
          begin
              if AText[i] = AOther[im] then
                nC := pC
              else
                nC := 1 + min3(nC, pC, C[im]);
              pC := C[im];
              C[im] := nC;
          end;
      end;
      result := C[m];
  end;
end;

function ApproxResemblesText(const AText, AOther: AnsiString; MaxError: Integer = 1): Boolean;
begin
  result := ApproxDistance(AText, AOther) <= MaxError;
end;


{ PreProcess_FilterCount initializes the character-array used ApproxContainsStr }

function PreProcess_FilterCount(const ASubText: AnsiString): TBC_IntArray;
var c: AnsiChar;
    m, i: Integer;
begin
  m := Length(ASubText);
  for c := Low(AnsiChar) to High(AnsiChar) do
    Result[c] := 0;
  for i := 1 to m do
    result[ASubText[i]] := result[ASubText[i]] + 1;
end;


{ ApproxContainsStr uses a Counting-Filter as described in (e.g.):
      Gonzalo Navarro
      Multiple approximate string matching by counting
      Proceedings of the 4th South American Workshop on String Processing (WSP'97)
      Carleton Univ. Press
      1997 }

function ApproxContainsStr(const AText, ASubText: AnsiString;  MaxError: Integer = 1): Boolean;
var n, m, IdxT, count: Integer;
    A: TBC_IntArray;
    lact, pC, nC, LastStop: Integer;
    C: Array of Integer;

    // Check Text in Searchwindow
    function Check(Start, Ende: Integer): Boolean;
    var i, IdxTCheck, s: Integer;
    begin
            result := False;
            if Start > LastStop then
            begin
                // Init new search via dynamic programming
                for i := 0 to m do C[i] := i;
                lact := min(m, MaxError + 1);
                s := start;
            end else
                s := LastStop + 1;
            // Continue with last search vie dynamic programming
            for IdxTCheck := s to Ende do
            begin
                pC := 0;
                nC := 0;
                for i := 1 to lact do
                begin
                    if ASubText[i] = AText[IdxTCheck] then
                      nC := pC
                    else
                      nC := 1 + min3(nC, pC, C[i]);
                    pC := C[i];
                    C[i] := nC;
                end;
                // get next "last active Cell"
                while C[lact] > MaxError do dec(lact);

                if lact = m then
                begin
                    result := True;
                    break;
                end else
                  inc(lact);
            end;
            LastStop := Ende;
    end;

begin
  n := length(AText);
  m := length(ASubText);

  if m = 0 then
  begin
      result := true;
  end else
  begin
      if m > n then
      begin
        result := ApproxResemblesText(AText, ASubText, MaxError);
      end else
      begin
            result := False;
            // Preprocessing
            A := PreProcess_FilterCount(ASubText);
            setlength(C, m+1);
            LastStop := -2;
            Count := 0;

            // Searching
            // Init Searchwindow
            for IdxT := 1 to min(m, n) do
            begin
                if A[AText[IdxT]] > 0 then inc(Count);
                A[AText[IdxT]] := A[AText[IdxT]] - 1;
            end;

            // check first Searchwindow
            If Count >= m - MaxError then
                result := Check(1, min(m, n));

            if not result then
                // Search the rest of the Text
                for IdxT := m+1 to n do
                begin
                    if A[AText[IdxT-m]] >= 0 then dec(Count);
                    A[AText[IdxT-m]] := A[AText[IdxT-m]] + 1;
                    if A[AText[IdxT]] > 0 then inc(Count);
                    A[AText[IdxT]] := A[AText[IdxT]] - 1;

                    // continue/start new check
                    If Count >= m - MaxError then
                      result := Check(IdxT - m + 1, IdxT);

                    if result then break;
                end;
      end;
  end;
end;


function ApproxContainsText(const AText, ASubText: AnsiString;  MaxError: Integer = 1): Boolean;
begin
  result := ApproxContainsStr(AnsiUpperCase(AText), AnsiUpperCase(ASubText), MaxError);
end;


{ Searches Backwards from a given Endposition of ASubText in AText to find the (or: a)
  startposition of the approximate appearance of the SubText with the given error.
  Result: number of characters in the approximate appearance

  This function is used by ApproxPosEx and ApproxBestAppearance.}

function SearchBackwards(const AText, ASubText: AnsiString; Offset: Integer; Err: Integer = 1): Integer;
var i, im, m: Integer;
    C: Array of Integer;
    pC, nC: Integer;
begin
  // just dynamic programming, but backwards
  // and with an exact error
  m := length(ASubText);
  setlength(C, m+1);
  for i := 0 to m do C[i] := i;
  result := -1;
  for i := Offset downto max(1,Offset - m - Err) do
  begin
      pC := 0;
      nC := 0;
      for im  := 1 to m do
      begin
          if AText[i] = ASubText[m-im+1] then
            nC := pC
          else
            nC := 1 + min3(nC, pC, C[im]);
          pC := C[im];
          C[im] := nC;
      end;
      if C[m] = err then
      begin
        result := Offset - i + 1;
        break;
      end;
  end;
end;


{ Searching the subtext in the text with dynamic-programming.
  Using the filteralgorithm as in ApproxContainsStr is possible but not used here
  to leave the code clearly arranged. }

function ApproxPosEx(const AText, ASubText: AnsiString; Offset: Integer; Out Count: Integer; MaxError: Integer = 1): Integer;
var m, n, lact, i, IdxT, pC, nC: Integer;
    C: Array of Integer;
    lastIdx, lastError: Integer;
begin
  n := length(AText);
  m := length(ASubText);

  // Init
  setlength(C, m+1);
  for i := 0 to m do C[i] := i;
  
  lact := min(m, MaxError + 1);

  result := 0;
  Count  := 0;
  lastIdx := -1;
  lastError := MaxError + 1;

  // Search
  for IdxT := Offset to n do
  begin
      pC := 0;
      nC := 0;
      for i := 1 to lact do
      begin
          if ASubText[i] = AText[IdxT] then
            nC := pC
          else
            nC := 1 + min3(nC, pC, C[i]);
          pC := C[i];
          C[i] := nC;
      end;
      // get next "last active Cell"
      while C[lact] > MaxError do dec(lact);

      if (lact = m) then
      begin
          // Found an appearance with at most maxError errors
          // ToDo: Find a "minimal-error-appearance" around the "current area"
          if (lastIdx = IdxT - 1) then
          begin
              // we are already checking for a "minimal-error-appearance" in the "current "area"
              if (C[lact] <= lastError) then
              begin
                  // we found a better apperance in the "current area"
                  lastIdx := idxT;
                  lastError := C[lact];
                  if lastIdx = n then
                  begin
                       Count := SearchBackwards(AText, ASubText, lastIdx, lastError);
                       result := lastIdx - Count + 1;
                       break;
                  end;
              end else
              begin
                  // Error is increasing - lastIdx was a "minimal-error-appearance"
                  Count := SearchBackwards(AText, ASubText, lastIdx, lastError);
                  result := lastIdx - Count + 1;
                  break;
              end;
          end else
          begin
              // we start a new check for a "minimal-error-appearance" in the "current "area"
              lastIdx := idxT;
              lastError := C[lact];
              if lastIdx = n then
              begin
                  Count := SearchBackwards(AText, ASubText, lastIdx, lastError);
                  result := lastIdx - Count + 1;
                  break;
              end;
          end;
      end else
      begin
        if (lastIdx = IdxT - 1) then
        begin
            // we ended a check for a "minimal-error-appearance" in the "current "area"
            // without finding a better/another one
            Count := SearchBackwards(AText, ASubText, lastIdx, lastError);
            result := lastIdx - Count + 1;
            break;
        end;
        inc(lact);
      end;
  end;
end;

function ApproxPos(const AText, ASubText: AnsiString; Out Count: Integer; MaxError: Integer = 1): Integer;
begin
  result := ApproxPosEx(AText, ASubText, 1, Count, MaxError);
end;


{ Searching for the best appearance with dynamic programming }

function ApproxBestAppearance(const AText, ASubText: AnsiString; out Count, BestError: Integer): Integer;
var i, im, n, m: Integer;
    C: Array of Integer;
    pC, nC: Integer;
begin
  n := length(AText);
  m := length(ASubText);
  if m = 0 then
  begin
      if n > 0 then
      begin
          Count := 1;
          BestError := 1;
          result := 1;
      end else
      begin
          Count := 0;
          BestError := 0;
          result := 0;
      end;
  end else
  begin
      if m > n then
      begin
        Count := n;
        result := min(1,n);
        BestError := ApproxDistance(AText, ASubText);
      end else
      begin
          setlength(C, m+1);
          for i := 0 to m do C[i] := i;

          BestError := m+1;
          Count := 0;
          result := 0;

          for i := 1 to n do
          begin
              pC := 0;
              nC := 0;
              for im  := 1 to m do
              begin
                  if AText[i] = ASubText[im] then
                    nC := pC
                  else
                    nC := 1 + min3(nC, pC, C[im]);
                  pC := C[im];
                  C[im] := nC;
              end;
              if C[m] < BestError then
              begin
                BestError := C[m];
                result := i;
              end;
          end;
          Count := SearchBackwards(AText, ASubText, result, BestError);
          result := result - Count + 1;
      end;
  end;
end;

end.
