+++ /dev/null
-#!/usr/bin/perl -w
-=head1 NAME
-
-find-hidden-word-text - find hidden text in MS Word documents
-
-=head1 SYNOPSIS
-
-find-hidden-word-text word.doc > hidden.txt
-
-=head1 DESCRIPTION
-
-This is a command-line UNIX tool to ease the task of discovering hidden text
-in MS Word documents.
-
-More specifically, it is an implementation of Method 2 from Simon Byers'
-paper, I<Scalable Exploitation of, and Responses to Information Leakage
-Through Hidden Data in Published Documents>, available at
-L<http://www.user-agent.org/word_docs.pdf>.
-
-This goes a little further in that it removes some common 'noise' strings,
-like 'Word.Document.8', 'Title', 'PAGE', 'Microsoft Word Document' and
-the like. It will also remove any strings that do not contain at least
-1 whitespace character.
-
-=head1 PREREQUISITES
-
-This tool requires that both C<antiword> and the C<strings> utility be
-installed and available on the PATH.
-
-=head1 AUTHOR
-
-Justin Mason, C<jm dash wordtext at jmason dot org>
-
-=head1 VERSION
-
-1.0 Aug 15 2003 jm
-
-=cut
-
-use strict;
-use warnings;
-
-# Main script: for each Word document named on the command line, print every
-# string that the "strings" utility finds in the raw file but that does not
-# occur in the visible text rendered by "antiword -t". Output is one line per
-# distinct string, formatted as "count|string" and sorted by string.
-
-# Strings that Word itself embeds in documents; these are never reported.
-# They are compared literally, after whitespace normalization.
-# Single-token droppings such as "Word.Document.8", "Normal", "Title" or
-# "MSWordDoc" need no entry here: any string without a space is discarded
-# by the no-space rule below.
-my %is_noise = map { $_ => 1 } (
-    'PAGE',
-    'Microsoft Word 9.0',
-    'Microsoft Word Document',
-    'Click to edit Master text styles',
-    'Click to edit Master title style',
-    'Embedded OLE Servers',
-);
-
-# With more than one document on the command line, head each report with
-# the file name so the individual outputs can be told apart.
-my $print_names = (scalar @ARGV > 1) ? 1 : 0;
-
-foreach my $file (@ARGV) {
-    if ($print_names) {
-        print "\n$file\n\n";
-    }
-
-    # Visible text, as rendered by antiword. The list form of open runs the
-    # program directly rather than through a shell, so spaces or shell
-    # metacharacters in $file cannot change the command being executed.
-    open(my $aw_fh, '-|', 'antiword', '-t', $file)
-        or die "cannot run antiword";
-    my $aw = join('', <$aw_fh>);
-    close $aw_fh or die "cannot run antiword -t $file";
-
-    # Raw strings extracted from the file by the strings utility.
-    open(my $str_fh, '-|', 'strings', $file)
-        or die "cannot run strings";
-    my $str = join('', <$str_fh>);
-    # A failing strings run is reported but not fatal: whatever output was
-    # collected is still processed.
-    close $str_fh or warn "strings $file did not exit cleanly\n";
-
-    # normalize the antiword version so that line wrapping and runs of
-    # whitespace do not prevent a match
-    $aw =~ s/\s+/ /g;
-
-    # get each string from strings, and see if we can find it in the
-    # "visible" text from antiword
-    my %count;
-    foreach my $line (split(/\n/, $str)) {
-        # same normalization as above, plus trimming of both ends
-        $line =~ s/\s+/ /g;
-        $line =~ s/^ //;
-        $line =~ s/ $//;
-
-        # Literal substring test. index() also treats an empty string as
-        # "found", so blank lines are skipped here.
-        next if index($aw, $line) >= 0;
-
-        # killfile.
-        # (disabled) skip almost-entirely non-alpha 4-byte snippets
-        #next if $line =~ /^(?:\W\w\W\W|\W\W\w\W|\w\W{3,3}|\W{4,4}|\W{3,3}\w)$/;
-
-        next if index($line, ' ') < 0;      # no spaces!
-
-        # (disabled) skip 4-to-6-byte snippets with 1 nonalpha and no spaces
-        #next if ($line =~ /^\S{4,6}$/ && $line =~ /\W/);
-
-        # common word droppings
-        next if $is_noise{$line};
-
-        $count{$line}++;
-    }
-
-    # output the strings and their counts
-    foreach my $line (sort keys %count) {
-        print "$count{$line}|$line\n";
-    }
-}