Saturday, October 11, 2003

Perl script to test list of URLs for URLs blocked by proxy server.

------------------------------------------------------------------
#!/usr/bin/perl -w
# Xtract & CHallenge
# Extract URLs from a given file and challenge the proxy server with the URLs
# Optionally 2 or 3 arguments:
# xch input-file [scratch-file] output-file
# If only 2 arguments are given, the scratch-file will be named output-file.scr
#
# URL-matching regexp modified from qxurl by tchrist@perl.com in perlfaq9
#

# Check for options

# Default ISP is SCV
$isp = "scv";

if (($ARGV[0] eq "-scv") or ($ARGV[0] eq "-mst") or ($ARGV[0] eq "-msh")) {
# Set ISP
$isp = substr($ARGV[0], 1);
# Go to next argument
shift @ARGV;
}
elsif (($ARGV[0] =~ /-/) and ($ARGV[0] ne "-c") and ($ARGV[0] ne "-x")) {
die "Usage: xch [-scv|msh|mst] [-c|v] source-file [intermediate-file] out-file";
}
else {
# Continue
}

if ($isp eq "scv") {
$isp_long = "SCV MaxOnline"}
elsif ($isp eq "msh") {
$isp_long = "MyStarHub"}
else {
$isp_long = "MySingTel"}

print STDOUT "ISP is $isp_long\n";

if ($ARGV[0] =~ /-/) {
if ($ARGV[0] eq "-c") {
# Challenge only
$x = 0;
$c = 1;
}
if ($ARGV[0] eq "-x") {
# Xtract only
$x = 1;
$c = 0;
}
shift @ARGV;
}
else {
# Default - Do both
$x = 1;
$c =1;
}


if (!($x && $c)) {
die "Can only specify 3 filenames when neither -c nor -x is set" if ($#ARGV == 2);
if ($#ARGV == 1) {
if ($x) {
($infile, $scratch) = @ARGV;
}
if ($c) {
($scratch, $outfile) = @ARGV;
}
}
else {
die "Usage: xch -x source_file challenge_list\nOr: xch -c challenge_list blocked_list";
}
}
else {
if ($#ARGV == 1) {
($infile, $outfile) = @ARGV;
$scratch = $outfile . '.scr';
# print "$infile $scratch $outfile" ;
}
elsif ($#ARGV == 2) {
($infile, $scratch, $outfile) = @ARGV;
}
else{
die "Usage: $0 raw-list [intermediate] blocked_list \n";
}
}

# Main

if ($x) {
xtract_URL();
}

if ($c) {
challenge_server();
}

#############################################
sub xtract_URL {

open INPUT, "< $infile";
open XTRACT, "> $scratch";

$links = 1;

print STDOUT "\nNote that xch does not rework links to www.....\nxcp is recommended.\n";

while (<INPUT>) {
$url = 0;
if (m{ clickurl=http:// (.*?) ["/] }gsix) {
$url = "http://$1";
}
elsif (m{ url=http:// (.*?) ["/] }gsix) {
$url = "http://$1";
}
elsif (m{ < \s* A \s+ HREF \s* = \s* (["']) http:// (.*?) \s* ["/] }gsix)
{$url = "http://$2";}
if ($url) {
print STDOUT $links++, "\t$url\n";
if ($url !~ /\?/) {
print XTRACT "$url\n"
}
}
}
close XTRACT;
#Close subroutine xtract_URL
}

################################################

sub challenge_server {

open CHALLENGE, "< $scratch";
open RESPONSE, "> $outfile" ;

#Set proxy response
#Note that this will change depending on ISP


# Set search string according to ISP
# SCV Maxonline




if ($isp eq "scv") {
$proxy = "";
$banned = "The URL you requested has been blocked by the caching server";
$banned2 = "Please contact your System Administrator with any questions";
}

# MySingtel
if ($isp eq "mst"){
$proxy = "--proxy proxy.zapsurf.com.sg:8080";
$banned = "302 Redirect";
$banned2 = "http://www.singnet.com.sg/cache/unable.html";
}

# MyStarHub
if ($isp eq "msh"){
$proxy = "--proxy.mystarhub.com.sg:8080";
$banned = "302 Redirect";
$banned2 = "http://www1.starhub.net.sg/proxy/access.html";
}

while () {
chop $_;
$line += 1;
# SCV has changed its proxy error response. Now have to check webpage returned
# rather than just the header
if ($isp eq "scv") {
$result = `curl --silent --connect-timeout 1 --max-time 3 $proxy $_\n`;
}
else {
$result = `curl --head --silent --connect-timeout 1 --max-time 3 $proxy $_\n`;
}
if (($result =~ /$banned/) and ($result =~ /$banned2/))
{
# Found a Forbidden site
# Use strict matching to avoid false positives from 403's at remote server
print RESPONSE "$_\n";
print STDOUT "$_ blocked \n";
}
else {
print STDOUT "$line ";
}
}
print "\n";
}

No comments: