[Back to TEXTFILE SWAG index]  [Back to Main SWAG index]  [Original]

{$A+,B-,D-,E-,F-,G-,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-}
Program Dup;
   { delete duplicate lines from a sorted text file }
   { Dup file1 file2 }

(* Author: Eddy Thilleman
   Donated to the public domain *)

(* {$DEFINE NoPlus} *)
(* uncomment the above line if you want to remove lines terminated
   by '+' characters *)

Uses
  Dos;

Type
  string3 = string[3];
Const
  WhiteSpace : string3 = #00#09#255;

Const
  NoFAttr : word = $1C; { attributen dir, volume, system }
  FAttr   : word = $23; { readonly-, hidden-, archive attributen }
  BufSize = 16384;      { buffersize 16 KB }
  divisor =  1000;

Type
  BufType = array [1..BufSize] of char;

Var
  Fname1, Fname2   : string;
  Line1, Line2     : string;
  OldFile, NewFile : text;
  OldBuf , NewBuf  : BufType;
  tel              : longint;


function OpenTextFile( var InF: text; name: string; var buffer: BufType ): boolean;
begin
  Assign( InF, Name );
  SetTextBuf( InF, buffer );
  Reset( InF );
  OpenTextFile := ( IOResult = 0 );
end { OpenTextFile };

function CreateTextFile( var OutF: text; name: string; var buffer: BufType ): boolean;
begin
  Assign( OutF, Name );
  SetTextBuf( OutF, buffer );
  Rewrite( OutF );
  CreateTextFile := ( IOResult = 0 );
end { CreateTextFile };


function FileExist( var FName : string ) : Boolean;
var
  F    : file;
  Attr : Word;
begin
  Assign( F, FName );
  GetFAttr( F, Attr );
  if DosError = 0 then
    FileExist := ( ( Attr and NoFAttr ) = 0 )
    { not dir-, volume- or system bit? }
  else
    FileExist := False;            { DosError }
  {}
end;


procedure StrCopy( var Str1, Str2: string ); assembler;
  { copy str1 to str2 }
asm
        LDS   SI,Str1    { load in DS:SI pointer to str1 }
        CLD              { string operations forward     }
        LES   DI,Str2    { load in ES:DI pointer to str2 }
        XOR   CH,CH      { clear CH                      }
        MOV   CL,[SI]    { length str1 --> CX            }
        INC   CX         { include length byte           }
    REP MOVSB            { copy str1 to str2             }
end  { StrCopy };


function CompUCStr( var Str1, Str2: String ): ShortInt; Assembler;
  { Compare Str1 and Str2 case insensitive }
asm     mov   dx, ds                 { save ds                        }
        lds   si, str1               { ds:si = @str1                  }
        les   di, str2               { es:di = @str2                  }
        cld                          { string operations forwards     }
        lodsb                        { get length string1 in AL       }
        mov   ah, es:[di]            { get length string2 in AH       }
        inc   di
        mov   bx, ax                 { save both lengths in BX        }
        xor   cx, cx                 { clear cx                       }
        mov   cl, al                 { get length String1 in CX       }
        cmp   cl, ah                 { equal to length String2?       }
        jb    @len                   { CX stores minimum length       }
        mov   cl, ah                 { of string1 and string2         }
  @len: jcxz  @exit                  { quit if null                   }

 @loop: lodsb                        { str1[i] in AL                  }
        mov   ah, es:[di]            { str2[i] in AH                  }

        cmp   al, 'a'                { uppercase if 'a'..'z'          }
        jb    @1
        cmp   al, 'z'
        ja    @1
        sub   al, 20h

    @1: cmp   ah, 'a'                { uppercase if 'a'..'z'          }
        jb    @2
        cmp   ah, 'z'
        ja    @2
        sub   ah, 20h

    @2: cmp   al, ah                 { compare str1 to str2           }
        jne   @not                   { loop if equal                  }
        inc   di                     { next char str2                 }
        dec   cx                     { countdown                      }
        jcxz  @exit                  { strings same, Length also?     }
        jmp   @loop                  { go do next char                }

  @not: mov   bx, ax                 { BL = AL = String1[i],
                                       BH = AH = String2[i]           }
 @exit: xor   ax, ax
        cmp   bl, bh                 { length or contents comp        }
        je    @equal                 { str1 = str2: return  0         }
        jb    @lower                 { str1 < str2: return -1         }
        inc   ax                     { str1 > str2: return  1         }
        inc   ax
@lower: dec   ax
@equal: mov   ds, dx                 { restore ds                     }
end   { CompUCStr };


procedure White2Space( var Str: string; const WhiteSpace: string ); assembler;
  { replace white space chars in Str by spaces
    the string WhiteSpace contains the chars to replace }
asm     { setup }
        cld                      { string operations forwards    }
        les   di, str            { ES:DI points to Str           }
        xor   cx, cx             { clear cx                      }
        mov   cl, [di]           { length Str in cl              }
        jcxz  @exit              { if length of Str = 0, exit    }
        inc   di                 { point to 1st char of Str      }
        mov   dx, cx             { store length of Str           }
        mov   bx, di             { pointer to Str                }
        lds   si, WhiteSpace     { DS:SI points to WhiteSpace    }
        mov   ah, [si]           { load length of WhiteSpace     }

@start: cmp   ah, 0              { more chars WhiteSpace left?   }
        jz    @exit              { no, exit                      }
        inc   si                 { point to next char WhiteSpace }
        mov   al, [si]           { next char to hunt             }
        dec   ah                 { ah counting down              }
        xor   dh, dh             { clear dh                      }
        mov   cx, dx             { restore length of Str         }
        mov   di, bx             { restore pointer to Str        }
        mov   dh, ' '            { space char                    }
@scan:
  repne scasb                    { the hunt is on                }
        jnz   @next              { white space found?            }
        mov   [di-1], dh         { yes, replace that one         }
@next:  jcxz  @start             { if no more chars in Str       }
        jmp   @scan              { if more chars in Str          }
@exit:
end  { White2Space };


procedure RTrim( var Str: string ); assembler;
  { remove trailing spaces from str }
asm     { setup }
        std                      { string operations backwards   }
        les   di, str            { ES:DI points to Str           }
        xor   cx, cx             { clear cx                      }
        mov   cl, [di]           { length Str in cl              }
        jcxz  @exit              { if length of Str = 0, exit    }
        mov   bx, di             { bx points to Str              }
        add   di, cx             { start with last char in Str   }
        mov   al, ' '            { hunt for spaces               }

        { remove trailing spaces }
   repe scasb                    { the hunt is on                }
        jz    @null              { only spaces?                  }
        inc   cx                 { no, don't lose last char      }
@null:  mov   [bx], cl           { overwrite length byte of Str  }
@exit:
end  { RTrim };


procedure LTrim( var Str: string ); assembler;
  { remove leading spaces from str }
asm     { setup }
        cld                      { string operations forward          }
        lds   si, str            { DS:SI points to Str                }
        xor   cx, cx             { clear cx                           }
        mov   cl, [si]           { length Str --> cl                  }
        jcxz  @exit              { if length Str = 0, exit            }
        mov   bx, si             { save pointer to length byte of Str }
        inc   si                 { 1st char of Str                    }
        mov   di, si             { pointer to 1st char of Str --> di  }
        mov   al, ' '            { hunt for spaces                    }
        xor   dx, dx             { clear dx                           }

@start: { look for leading spaces }
   repe scasb                    { the hunt is on                     }
        jz    @done              { if only spaces, we are done        }
        inc   cx                 { no, don't lose 1st non-blank char  }
        dec   di                 { no, don't lose 1st non-blank char  }
        mov   dx, cx             { new lenght of Str                  }
        xchg  di, si             { swap si and di                     }
    rep movsb                    { move remaining part of Str         }
@done:  mov   [bx], dl           { new length of Str                  }
@exit:
end  { LTrim };


function LineOK( var str: string ) : Boolean; assembler;
  { Line contains chars > ASCII 20h ? }
asm     { setup }
        xor   ax, ax         { assume false return value        }
        xor   cx, cx         { clear cx                         }
        lds   si, str        { load in DS:SI pointer to Str     }
        mov   cl, [si]       { length Str --> cx                }
        jcxz  @exit          { if no characters, exit           }
        inc   si             { point to 1st character           }

        { look for chars > ASCII 20h }
@start: mov   bl, [si]       { load character                   }
        cmp   bl, ' '        { char > ASCII 20h?                }
        ja    @yes           { yes, return true                 }
        inc   si             { next character                   }
        dec   cx             { count down                       }
        jcxz  @exit          { if no more characters left, exit }
        jmp   @start         { try again                        }
@yes:   mov   ax, 1          { return value true                }
@exit:
end  { LineOK };


procedure TestLine( var Line: string );
var
  len: byte absolute Line;

  procedure TrimLine;
  begin
    White2Space( Line, WhiteSpace );  { white space to spaces   }
    RTrim( Line );                    { remove trailing spaces  }
  end;

begin
  TrimLine;
  while not EOF( OldFile ) and ( IOResult = 0 )
  and ((len = 0) or not LineOK( Line )
{$IFDEF NoPlus}
  or (Line[len] = '+')
{$ENDIF}
  ) do
  begin
    ReadLn( OldFile, Line );
  end;
end;  { TestLine }


begin
  if ParamCount > 1 then             { parameters file1 file2 }
  begin
    Fname1 := FExpand( ParamStr( 1 ) );
    Fname2 := FExpand( ParamStr( 2 ) );
    tel := 0;
    if FileExist( Fname1 ) then
    begin
      if OpenTextFile( OldFile, Fname1, OldBuf ) then
      begin
        if CreateTextFile( NewFile, Fname2, NewBuf ) then
        begin
          Line1 := '';
          ReadLn( OldFile, Line2 );

          while not EOF( OldFile ) and ( IOResult = 0 ) do
          begin
            TestLine( Line2 );
            if (CompUCStr( Line1, Line2 ) <> 0) then
            begin
              StrCopy( Line2, Line1 );         { copy Line2 to Line1 }
              WriteLn( NewFile, Line1 );
              inc( tel );
              if (tel mod divisor) = 0 then write( #13, tel, ' unique lines' );
            end;
            ReadLn( OldFile, Line2 );
          end {while not EOF};

          TestLine( Line2 );
          if (length( Line2 ) > 0) and (CompUCStr( Line1, Line2 ) <> 0) then
          begin
            WriteLn( NewFile, Line2 );
            inc( tel );
          end;

          writeln( #13, tel, ' unique lines' );
          Close( NewFile );
          Close( OldFile );
        end { if create file2 }
        else
          writeln(' error creating file ', Fname1 );
        { error creating file }
      end { if open file1 }
      else
        writeln(' error opening file ', Fname1 );
      { error opening file }
    end { if FileExist( Fname1 ) }
    else
      writeln( Fname1, ' not found' );
    { file not found }
  end { if ParamCount > 1 }
  else
    Writeln( 'Dup file1 file2' );
end.

[Back to TEXTFILE SWAG index]  [Back to Main SWAG index]  [Original]