#!/usr/bin/perl -w
# flood.pl - Continually make Web requests to 'random' sites
#
# (C)2005 Ralph Bolton, www.coofercat.com
# Released under the GNU Public License (v2)
# See http://www.gnu.org/copyleft/gpl.html
#
# This script is designed to run permanently on a system. It makes
# periodic web requests to 'random' web sites that it finds by making
# 'random' searches on internet search engines.
#
# The intention of this script is to make numerous web requests. Each
# web request is logged by many ISPs (as they are legally required to
# do so). As such, it is hoped that the sheer number of requests will
# over-burden the ISPs to the point that they have to pressure law
# enforcement to reduce the time they have to keep logs for. Also, the
# deluge of automated entries in the logs makes finding real human-based
# requests more difficult. Ordinary scanning of the logs for known
# 'bad' sites will yield unrelated hits, making as much of a mockery
# of that log storage as possible.
#
# In order to limit the effect on the system it is running on, the
# script sleeps for a variable amount of time between requests. It can
# also stop downloading files when they reach a certain size, so as to
# avoid using too much of the available bandwidth. In practice, the
# script does not have a discernible effect on system load or on a
# 512K ADSL connection.
#
# The script uses a series of words, from which it selects a few and makes
# a search engine request with them. The results it gets are 'scraped' from
# the response and added to a list. The list is then fetched, one by one,
# with a pause between each. Once the list is emptied, another random search
# takes place to fill the list once more.
#
# To avoid endlessly visiting search engine pages (such as caches or
# advertising), the script can avoid certain URLs on a per-engine basis.
# It can also avoid certain URLs globally, if you wish to do that.
#
# To make this script more useful, it also uses a series of User-Agent strings
# when it makes requests. This means that the traffic from the script is less
# easy to filter from ordinary logs (although if you only ever use one browser
# yourself, you may wish to trim the list of User Agents so that it's impossible
# to tell what you downloaded and what the script downloaded!).
#
# ===============================================================================
# Configuration (yes, you have to edit code to configure it)
#
# Insert your favourite search engines in here. Be aware that Yahoo does not
# return direct links to destination sites (their results are a request to Yahoo
# which redirects to the intended destination). As such, these sorts of results
# are essentially unsuitable for this sort of work :-(
#
# Each search engine needs a URL (which will have the search terms appended to
# it). There can (and should!) be a list of URLs to avoid; normally this will
# include the search engine itself so the script doesn't download cache or
# advertising pages.
@search_engines=(
    { url         => 'http://www.google.co.uk/search?q=',
      never_visit => [ '.google.', '216.239.' ] },
    { url         => 'http://search.msn.com/results.aspx?q=',
      never_visit => [ '.msnscache.com', 'msn.com', 'hotmail.com' ] }
);
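# As an illustration only (not a tested entry), a further engine would be added
# with another hash of the same shape; the URL and pattern below are hypothetical:
#
#   { url         => 'http://search.example.com/results?q=',
#     never_visit => [ '.example.com' ] }
#
# Each never_visit entry is treated as a plain substring; strip_links() below
# escapes it with regexify() and greps matching results out of the scraped list.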
# User Agents. This list of agents is used in requests. One will be picked
# at random for each request. Note: Places like Google only respond to 'proper'
# user agent strings, so it may not be advisable to put non-browser strings here.
@user_agents=(
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20040616',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050513 Fedora/1.0.4-1.3.1 Firefox/1.0.4',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.01',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'
);

# Never Visit. This is a list of URLs that will never be visited. If you're worried
# that you might be responsible for hitting dubious web sites (such as Fox, CNN etc.)
# then add them to this list.
@never_visit=();

# Various other configuration options. MIN_SLEEP_TIME defines the minimum sleep time
# between requests. RANDOM_SLEEP_TIME is the randomisation to add to this. In other
# words, the actual sleep will be MIN_SLEEP_TIME + some_amount_of(RANDOM_SLEEP_TIME).
#
# If the words file is not found, then a 'hard coded' list is used (see below). Ideally,
# point the script at a file that has words in it, one per line. Otherwise, make the
# hard coded list a lot longer!
#
# MAX_SEARCH_WORDS defines how many words to stick together in requests. The script may
# use fewer words; this is just the maximum.
#
# PID_FILE defines where the script will drop a file with its process ID in it. Leave
# empty for no PID file.
#
# MAX_SIZE determines the most we'll ever download in a single request. This stops the
# script scooping up all your bandwidth downloading some multi-gigabyte file.
use constant MIN_SLEEP_TIME    => 45;
use constant RANDOM_SLEEP_TIME => 30;
use constant WORDS_FILE        => '/usr/share/dict/words';
use constant MAX_SEARCH_WORDS  => 4;
use constant PID_FILE          => '/var/tmp/flood.pid';
use constant MAX_SIZE          => 1000000;

# Search words. If the file (above) is not found, then this list of words is used.
@search_words=('flex','chemical','technology','computer','fishing','football','news');

# =================================================================================================

use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;

# We just create one of these, globally
$ua = LWP::UserAgent->new(max_size => MAX_SIZE);

# regexify: Escape text to make it 'regex friendly'.
sub regexify {
    my ($data)=@_;

    # Make $data regex friendly (see Perl book, p59)
    # Remember: "." and "/" are also important! (not specifically mentioned in the book)
    $data=~s/([\\\|\(\)\[\]\{\}\^\$\*\+\?\.\/]{1})/\\$1/g;

    return $data;
}

# get_random_ua: Return a random User-Agent string from the list.
sub get_random_ua {
    my $num=rand($#user_agents + 1);
    return $user_agents[$num];
}
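# Illustrative only (these lines are not part of the script's flow): regexify()
# escapes regex metacharacters so that never_visit patterns match literally, and
# get_random_ua() is what varies the browser signature on each request.
#
#   my $pattern = &regexify('.google.');   # gives '\.google\.'
#   $ua->agent(&get_random_ua);            # e.g. one of the Firefox strings above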
# Set up a callback that collects links
my @_callback_links = ();
sub _get_links_callback {
    my($tag, %attr) = @_;

    # Get any kind of link, a..href, img, etc.
    #return if $tag ne 'a';
    push(@_callback_links, values %attr);
}

sub get_links {
    my ($url)=@_;

    @_callback_links=();

    # Make the parser. Unfortunately, we don't know the base yet
    # (it might be different from $url)
    $p = HTML::LinkExtor->new(\&_get_links_callback);

    $ua->agent(&get_random_ua);

    # Request document and parse it as it arrives
    $res = $ua->request(HTTP::Request->new(GET => $url),
                        sub {$p->parse($_[0])});

    # Expand all URLs to absolute ones
    my $base = $res->base;
    return map { $_ = url($_, $base)->abs; } @_callback_links;
}

# strip_links: Remove any links matching the search engine's never_visit list
# or the global @never_visit list.
sub strip_links {
    my ($se,$links_ref)=@_;

    my $test;
    foreach $test (@{$se->{never_visit}}, @never_visit) {
        my $string=&regexify($test);
        @$links_ref=grep(!/$string/,@$links_ref);
    }
    return 1;
}

# get_random_se: Return a random search engine from the list.
sub get_random_se {
    my $num=rand($#search_engines + 1);
    return $search_engines[$num];
}

sub make_random_search_term {
    # First, decide how many words to use...
    my $count=rand(MAX_SEARCH_WORDS);

    my $i;
    my @words=();
    for($i=0; $i<$count; $i++) {
        # Get a word from the list...
        my $word=$search_words[rand($#search_words + 1)];

        # Remove it from the list if it's already in there...
        @words=grep( { "$_" ne "$word" } @words);

        # Now add it to the list...
        push @words,$word;
    }

    # Separate with a + so it's URL friendly
    return join('+',@words);
}

sub seed_words {
    return 0 unless(open(WORDS,"< " . WORDS_FILE));

    # Slurp the whole file, then split it into one word per line
    undef $/;
    my $temp=<WORDS>;
    close(WORDS);

    @search_words=split(/\s*\n\s*/,$temp);
    $temp="";

    my $count=$#search_words + 1;
    print "Seeded search words with $count words from " . WORDS_FILE . "\n";
    return 1;
}

MAIN: {
    if(!defined($ARGV[0])) {
        eval("use Proc::Daemon;");
        &Proc::Daemon::init unless($@);

        if(PID_FILE ne "") {
            # Drop a PID
            if(open(PID,"> " . PID_FILE)) {
                print PID "$$\n";
                close(PID);
            }
        }
    } else {
        # Go unbuffered...
        $|=1;
    }

    # Seed the search words...
    &seed_words;

    # Loop forever: make a request to a search engine, then
    # request each URL it gave us to look at.
    while(1) {
        my $terms=&make_random_search_term;
        my $se=&get_random_se;
        my $url=$se->{url} . $terms;

        print "Fetching links from $url...";
        my @links=&get_links($url);
        &strip_links($se,\@links);

        if($#links == -1) {
            # Got no links from this one...
            print "No Links!\n";
        } else {
            print "Done\n";
        }

        my $sleep_time=MIN_SLEEP_TIME + rand(RANDOM_SLEEP_TIME);
        sleep($sleep_time);

        # Now visit every URL...
        my $item;
        foreach $item (@links) {
            $ua->agent(&get_random_ua);

            print " Fetching $item...";
            $res = $ua->request(HTTP::Request->new(GET => $item));
            print "Done (" . $res->code . ").\n";

            # Now wait for the specified sleep time...
            $sleep_time=MIN_SLEEP_TIME + rand(RANDOM_SLEEP_TIME);
            sleep($sleep_time) if($res->code <= 500);
        }
    }
}
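# Example usage (illustrative; the argument's value is ignored, only its
# presence matters to the code above):
#
#   ./flood.pl           # no argument: daemonise (if Proc::Daemon is installed)
#                        # and write the PID to /var/tmp/flood.pid
#   ./flood.pl debug     # any argument: stay in the foreground and print
#                        # unbuffered progress output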