#!/usr/bin/perl -w
# flood.pl - Continually make Web requests to 'random' sites
#
# (C)2005 Ralph Bolton, www.coofercat.com
# Released under the GNU Public License (v2)
# See http://www.gnu.org/copyleft/gpl.html
#
# This script is designed to run permanently on a system. It makes
# periodic web requests to 'random' web sites that it finds by making
# 'random' searches on internet search engines.
#
# The intention of this script is to make numerous web requests. Each
# web request is logged by many ISPs (as they are legally required to
# do so). As such, it is hoped that the sheer number of requests will
# over-burden the ISPs to the point that they have to pressure law
# enforcement to reduce the time they have to keep logs for. Also, the
# deluge of automated entries in the logs makes finding real human-based
# requests more difficult. Ordinary scanning of the logs for known
# 'bad' sites will yield unrelated hits, making as much of a mockery
# of that log storage as possible.
#
# In order to limit the effect on the system it is running on, the
# script sleeps for a variable amount of time between requests. It can
# also stop downloading files when they reach a certain size, so as to
# avoid using too much of the available bandwidth. In practice, the
# script does not have a discernible effect on system load or on a
# 512K ADSL connection.
#
# The script uses a series of words, from which it selects a few and makes
# a search engine request with them. The results it gets are 'scraped' from
# the response and added to a list. The list is then fetched, one by one,
# with a pause between each. Once the list is emptied, another random search
# takes place to fill the list once more.
#
# To avoid endlessly visiting search engine pages (such as caches or
# advertising), the script can avoid certain URLs on a per-engine basis.
# It can also avoid certain URLs globally, if you wish to do that.
#
# To make this script more useful, it also uses a series of User-Agent strings
# when it makes requests. This means that the traffic from the script is less
# easy to filter from ordinary logs (although if you only ever use one browser
# yourself, you may wish to trim the list of User Agents so that it's impossible
# to tell what you downloaded and what the script downloaded!).
#
# ===============================================================================
# Configuration (yes, you have to edit code to configure it)
#
# Insert your favourite search engines in here. Be aware that Yahoo does not
# return direct links to destination sites (their results are a request to Yahoo
# which redirects to the intended destination). As such, these sorts of results
# are essentially unsuitable for this sort of work :-(
#
# Each search engine needs a URL (which will have the search terms appended to
# it). There can (and should!) be a list of URLs to avoid; normally this will
# include the search engine itself so the script doesn't download cache or
# advertising pages.
@search_engines=(
    { url         => 'http://www.google.co.uk/search?q=',
      never_visit => [ '.google.', '216.239.' ] },
    { url         => 'http://search.msn.com/results.aspx?q=',
      never_visit => [ '.msnscache.com', 'msn.com', 'hotmail.com' ] }
);
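# As an illustration only (not a tested entry), a further engine would be added
# with another hash of the same shape; the URL and pattern below are hypothetical:
#
#   { url         => 'http://search.example.com/results?q=',
#     never_visit => [ '.example.com' ] }
#
# Each never_visit entry is treated as a plain substring; strip_links() below
# escapes it with regexify() and greps matching results out of the scraped list.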
# User Agents. This list of agents is used in requests. One will be picked
# at random for each request. Note: Places like Google only respond to 'proper'
# user agent strings, so it may not be advisable to put non-browser strings here.
@user_agents=(
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20040616',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050513 Fedora/1.0.4-1.3.1 Firefox/1.0.4',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.01',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'
);

# Never Visit. This is a list of URLs that will never be visited. If you're worried
# that you might be responsible for hitting dubious web sites (such as Fox, CNN etc.)
# then add them to this list.
@never_visit=();

# Various other configuration options. MIN_SLEEP_TIME defines the minimum sleep time
# between requests. RANDOM_SLEEP_TIME is the randomisation to add to this. In other
# words, the actual sleep will be MIN_SLEEP_TIME + some_amount_of(RANDOM_SLEEP_TIME).
#
# If the words file is not found, then a 'hard coded' list is used (see below). Ideally,
# point the script at a file that has words in it, one per line. Otherwise, make the
# hard coded list a lot longer!
#
# MAX_SEARCH_WORDS defines how many words to stick together in requests. The script may
# use fewer words; this is just the maximum.
#
# PID_FILE defines where the script will drop a file with its process ID in it. Leave
# empty for no PID file.
#
# MAX_SIZE determines the most we'll ever download in a single request. This stops the
# script scooping up all your bandwidth downloading some multi-gigabyte file.
use constant MIN_SLEEP_TIME    => 45;
use constant RANDOM_SLEEP_TIME => 30;
use constant WORDS_FILE        => '/usr/share/dict/words';
use constant MAX_SEARCH_WORDS  => 4;
use constant PID_FILE          => '/var/tmp/flood.pid';
use constant MAX_SIZE          => 1000000;

# Search words. If the file (above) is not found, then this list of words is used.
@search_words=('flex','chemical','technology','computer','fishing','football','news');

# =================================================================================================

use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;

# We just create one of these, globally
$ua = LWP::UserAgent->new(max_size => MAX_SIZE);

# regexify: Escape text to make it 'regex friendly'.
sub regexify {
    my ($data)=@_;

    # Make $data regex friendly (see Perl book, p59)
    # Remember: "." and "/" are also important! (not specifically mentioned in the book)
    $data=~s/([\\\|\(\)\[\]\{\}\^\$\*\+\?\.\/]{1})/\\$1/g;

    return $data;
}

# get_random_ua: Return a random User-Agent string from the list.
sub get_random_ua {
    my $num=rand($#user_agents + 1);
    return $user_agents[$num];
}
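# Illustrative only (these lines are not part of the script's flow): regexify()
# escapes regex metacharacters so that never_visit patterns match literally, and
# get_random_ua() is what varies the browser signature on each request.
#
#   my $pattern = &regexify('.google.');   # gives '\.google\.'
#   $ua->agent(&get_random_ua);            # e.g. one of the Firefox strings above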
# Set up a callback that collects links
my @_callback_links = ();
sub _get_links_callback {
    my($tag, %attr) = @_;

    # Get any kind of link, a..href, img, etc.
    #return if $tag ne 'a';
    push(@_callback_links, values %attr);
}

sub get_links {
    my ($url)=@_;

    @_callback_links=();

    # Make the parser. Unfortunately, we don't know the base yet
    # (it might be different from $url)
    $p = HTML::LinkExtor->new(\&_get_links_callback);

    $ua->agent(&get_random_ua);

    # Request document and parse it as it arrives
    $res = $ua->request(HTTP::Request->new(GET => $url),
                        sub {$p->parse($_[0])});

    # Expand all URLs to absolute ones
    my $base = $res->base;
    return map { $_ = url($_, $base)->abs; } @_callback_links;
}

# strip_links: Remove any links matching the search engine's never_visit list
# or the global @never_visit list.
sub strip_links {
    my ($se,$links_ref)=@_;

    my $test;
    foreach $test (@{$se->{never_visit}}, @never_visit) {
        my $string=&regexify($test);
        @$links_ref=grep(!/$string/,@$links_ref);
    }
    return 1;
}

# get_random_se: Return a random search engine from the list.
sub get_random_se {
    my $num=rand($#search_engines + 1);
    return $search_engines[$num];
}

sub make_random_search_term {
    # First, decide how many words to use...
    my $count=rand(MAX_SEARCH_WORDS);

    my $i;
    my @words=();
    for($i=0; $i<$count; $i++) {
        # Get a word from the list...
        my $word=$search_words[rand($#search_words + 1)];

        # Remove it from the list if it's already in there...
        @words=grep( { "$_" ne "$word" } @words);

        # Now add it to the list...
        push @words,$word;
    }

    # Separate with a + so it's URL friendly
    return join('+',@words);
}

sub seed_words {
    return 0 unless(open(WORDS,"< " . WORDS_FILE));

    # Slurp the whole file, then split it into one word per line
    undef $/;
    my $temp=<WORDS>;
    close(WORDS);

    @search_words=split(/\s*\n\s*/,$temp);
    $temp="";

    my $count=$#search_words + 1;
    print "Seeded search words with $count words from " . WORDS_FILE . "\n";
    return 1;
}

MAIN: {
    if(!defined($ARGV[0])) {
        eval("use Proc::Daemon;");
        &Proc::Daemon::init unless($@);

        if(PID_FILE ne "") {
            # Drop a PID
            if(open(PID,"> " . PID_FILE)) {
                print PID "$$\n";
                close(PID);
            }
        }
    } else {
        # Go unbuffered...
        $|=1;
    }

    # Seed the search words...
    &seed_words;

    # Loop forever: make a request to a search engine, then
    # request each URL it gave us to look at.
    while(1) {
        my $terms=&make_random_search_term;
        my $se=&get_random_se;
        my $url=$se->{url} . $terms;

        print "Fetching links from $url...";
        my @links=&get_links($url);
        &strip_links($se,\@links);

        if($#links == -1) {
            # Got no links from this one...
            print "No Links!\n";
        } else {
            print "Done\n";
        }

        my $sleep_time=MIN_SLEEP_TIME + rand(RANDOM_SLEEP_TIME);
        sleep($sleep_time);

        # Now visit every URL...
        my $item;
        foreach $item (@links) {
            $ua->agent(&get_random_ua);

            print " Fetching $item...";
            $res = $ua->request(HTTP::Request->new(GET => $item));
            print "Done (" . $res->code . ").\n";

            # Now wait for the specified sleep time...
            $sleep_time=MIN_SLEEP_TIME + rand(RANDOM_SLEEP_TIME);
            sleep($sleep_time) if($res->code <= 500);
        }
    }
}
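# Example usage (illustrative; the argument's value is ignored, only its
# presence matters to the code above):
#
#   ./flood.pl           # no argument: daemonise (if Proc::Daemon is installed)
#                        # and write the PID to /var/tmp/flood.pid
#   ./flood.pl debug     # any argument: stay in the foreground and print
#                        # unbuffered progress output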