Commit ac402657 authored by David Prévot

Add (broken?) urlcheck scripts to the repository

parent bacab655
The current cron jobs look like this:
# urlcheck scripts -- Responsible: djpig
17 3 * * * cd $HOME/urlcheck && ./run.urlcheck
36 12 * * * cd $HOME/urlcheck && ./make.bad_link.pages
5 13 * * * cd $HOME/urlcheck && ./cleanup.logs
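(In short, as far as the scripts below show: run.urlcheck spiders www.debian.org and writes raw logs under logs/, make.bad_link.pages turns those logs into per-directory bad-link reports under build-logs/urlcheck, and cleanup.logs compresses and expires old logs.)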
#!/usr/bin/perl -w
# only include pages for which at least one error was reported
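# Usage (as in make.bad_link.pages): ./bad_pages.pl DIR < urlcheck-log
# Prints only the report sections for pages under http://www.debian.org/DIR/;
# the special argument 'other' selects top-level pages instead.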
use strict;
my $strn = $ARGV[0];
my $line = '';
my $out = '';
if ($strn eq 'other') {
$strn = '[^/]+\n';
}
else {
$strn = "$strn/";
}
while (<STDIN>) {
if (/^Looking/ and ($line ne '')) {
# print $line;
if ($line =~ m,^Looking into http://www.debian.org/$strn,) {
$out .= $line;
}
$line = $_;
}
else {
$line .= $_;
}
}
# also check the final section, which the loop above never reaches
if ($line =~ m,^Looking into http://www.debian.org/$strn,) {
    $out .= $line;
}
if ($out ne '') {
print <<END;
This is a list of bad links found in the pages under this directory or
file on the Debian web site. Please fix them.
Note that not every URL listed is necessarily bad; the error message
should tell you why it was flagged. In particular, to save time, the
checker gives up on a request sooner than some very slow sites need to
answer it.
END
print $out;
}
#!/bin/sh
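# cleanup.logs (name inferred from the crontab above): delete urlcheck logs
# older than a week and gzip those older than two days.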
find logs -daystart -mtime +7 -exec rm {} \;
find logs -daystart -mtime +2 -not -name '*.gz' -exec gzip -9 {} \;
#!/usr/bin/perl -w
use strict;
# Usage: ./extract_lang.pl lang file
# where 'lang' is the 2-letter ISO abbreviation for the language and
# 'file' is the name of the file you wish to extract data from
# Example: ./extract_lang.pl es releases/potato
# will extract the list of bad links from the Spanish translation from
# the file releases/potato
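# In this setup it is run from make.bad_link.pages as './extract_lang.pl en'
# to produce the English-only *.en reports.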
my $lang = $ARGV[0];
my $file = $ARGV[1];
if (!$lang) {
    print "The first argument must be the 2-letter ISO language code.\n";
    print "The second argument must be the file to work on.\n";
    exit(1);
}
if (!$file) {
die "The second argument must be a filename";
}
if (-f $file) {
open(FIL, "<$file") or die "cannot open $file: $!";
}
else {
die "no such file, $file";
}
my $search = "^Looking into http:.*\\.$lang\\.html\n";
my $line = "";
my $section = "";
my $blank;
foreach (<FIL>) {
if (/^Looking into/) {
$line = $_;
if ($section and not $blank) {
process($section);
$blank = 1;
}
$section = $line;
}
else {
$section .= $_;
$blank = 0;
}
}
if ($section and not $blank) {
process($section);
}
sub process {
my ($section) = @_;
if ($section =~ m,^Looking into http://[\w/.-]+\n$,m) {
return;
}
if ($section =~ m,$search,) {
print $section;
}
}
#!/bin/sh
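# make.bad_link.pages (name inferred from the crontab above): split the day's
# urlcheck logs into per-directory bad-link reports under
# build-logs/urlcheck, plus English-only *.en variants.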
date="$1"
if [ -z "$date" ] ; then
date=`date +%Y%m%d`
fi
topdir=/srv/www-master.debian.org/htdocs/build-logs/urlcheck
[ -d $topdir ] || mkdir $topdir
dir='MailingLists banners Bugs consultants events intro legal logos mirror misc partners volatile vote y2k other'
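# For each area: keep only the sections that still contain errors (prune.pl),
# pick out the pages under each directory (bad_pages.pl), and extract the
# English pages (extract_lang.pl).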
./prune.pl logs/web.$date > logs/web.$date.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
# Disabled, see note in run.urlcheck
# [ -d $topdir/security ] || mkdir $topdir/security
# dir='security/1997 security/1998 security/1999 security/2000 security/2001 security/2002 security/2003 security/2004 security/2005 security/2006 security/2007 security/2008 security/2009 security/2010 security/2011 security/undated'
# ./prune.pl logs/web.$date.sec > logs/web.$date.sec.bad
# for loc in $dir ; do
# ./bad_pages.pl $loc < logs/web.$date.sec.bad > $topdir/$loc ;
# ./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
# done
[ -d $topdir/releases ] || mkdir $topdir/releases
dir='releases/stable releases/testing releases/unstable releases/slink releases/potato releases/woody releases/sarge releases/etch releases/lenny releases/2.1 releases/2.2 releases/3.0 releases/3.1 releases/4.0 releases/5.0'
./prune.pl logs/web.$date.rel > logs/web.$date.rel.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.rel.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='CD'
./prune.pl logs/web.$date.cd > logs/web.$date.cd.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.cd.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='devel'
./prune.pl logs/web.$date.devel > logs/web.$date.devel.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.devel.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='distrib'
./prune.pl logs/web.$date.distrib > logs/web.$date.distrib.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.distrib.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='doc'
./prune.pl logs/web.$date.doc > logs/web.$date.doc.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.doc.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='international'
./prune.pl logs/web.$date.intl > logs/web.$date.intl.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.intl.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
dir='users'
./prune.pl logs/web.$date.users > logs/web.$date.users.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.users.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
[ -d $topdir/News ] || mkdir $topdir/News
dir='News/1998 News/1999 News/2000 News/2001 News/2002 News/2003 News/2004 News/2005 News/2006 News/2007 News/2008 News/2009 News/2010 News/2011 News/weekly'
./prune.pl logs/web.$date.news > logs/web.$date.news.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.news.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
[ -d $topdir/ports ] || mkdir $topdir/ports
dir='ports/alpha ports/amd64 ports/arm ports/beowulf ports/freebsd ports/hppa ports/hurd ports/i386 ports/ia64 ports/kfreebsd-gnu ports/m68k ports/mips ports/netbsd ports/powerpc ports/s390 ports/sparc ports/sparc64'
./prune.pl logs/web.$date.ports > logs/web.$date.ports.bad
for loc in $dir ; do
./bad_pages.pl $loc < logs/web.$date.ports.bad > $topdir/$loc ;
./extract_lang.pl en $topdir/$loc > $topdir/$loc.en
done
www.corel.com
www.sun.se
www.mapblast.com
www.borders.com
www.sun.no
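(The five hostnames above appear to be a data file listing non-compliant servers: the second copy of urlcheck.py below skips any host found in its `noncompliant` list, and some of the same sites are linked from the test page further down.)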
#!/usr/bin/perl -w
use strict;
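# Usage: ./prune.pl urlcheck-log
# Drops entries that are OK or cannot be verified (200/302 responses, ftp
# links that are ok, mailto:/news:/https: links, 301s whose new URL merely
# adds a trailing slash) and prints only the report sections that still
# contain errors.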
my $file = $ARGV[0];
if (!$file) {
die "a file must be given as a command line argument";
}
if (-f $file) {
open(FIL, "<$file") or die "cannot open $file: $!";
}
else {
die "no such file, $file";
}
my $line = "";
my $section = "";
my $blank = 0;
foreach (<FIL>) {
if (/^Looking into/) {
$line = $_;
if ($section and not $blank) {
process($section);
$blank = 1;
}
$section = $line;
}
else {
$section .= $_;
$blank = 0;
}
}
# also handle the final section, which the loop above never reaches
if ($section and not $blank) {
    process($section);
}
sub process {
my ($section) = @_;
#if ($section =~ /^Looking into(.*?)\$(\n .*?$)*\Z(?!\n)/sg) {
# print "one gone\n";
# return;
#}
$section =~ s,\n http://(.*)\(200\) OK$,,mgo;
$section =~ s,\n http://(.*)\(200\) Document,,mgo;
$section =~ s,\n http://(.*)\(200\) Request,,mgo;
$section =~ s,\n (http://\S+).*?\(301\) Moved Permanently. New URL: \1/\s$,,mg;
$section =~ s,\n http://(.*)\(302\) Found,,mgo;
$section =~ s,\n http://(.*)\(302\) Moved,,mgo;
$section =~ s,\n http://(.*)\(302\) Temporary,,mgo;
$section =~ s,\n http://(.*)\(302\) Object,,mgo;
$section =~ s,\n mailto:(.*)validity$,,mgo;
$section =~ s,\n mailto:(.*) invalid url$,,mgo;
$section =~ s,\n ftp:(.*)is ok$,,mgo;
$section =~ s,\n https:(.*)validity$,,mgo;
$section =~ s,\n news:(.*)validity$,,mgo;
$section =~ s,\n news:(.*) invalid url$,,mgo;
if ($section =~ m,^Looking into http://[\w/.-]+\n$,m) {
return;
}
elsif ($section =~ m,^Looking into http://(.*?)\n searching not continued in (.*) directory.\n$,m) {
return;
}
$section =~ s/error\n/error /g;
print $section;
}
#!/bin/bash
# urlcheck.py [--require STRING] [--ignore STRING] URL...
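# Each invocation below crawls one subtree of www.debian.org in the background,
# sending stdout and stderr to logs/web.YYYYMMDD plus a per-subtree suffix.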
date=`date +%Y%m%d`
[ -e ./logs ] || mkdir ./logs
./urlcheck.py --require www.debian.org/ --ignore www.debian.org/~ --ignore /Packages/ \
--ignore News/weekly/oldurl --ignore /Lists-Archives --ignore /cgi-bin/fom \
--ignore debian.org/fom --ignore /releases/ --ignore /international/ --ignore /security/ \
--ignore /devel/ --ignore /News/ --ignore /doc/ --ignore /distrib/ \
--ignore /ports/ --ignore /intl/ \
http://www.debian.org/ >& logs/web.$date &
./urlcheck.py --require www.debian.org/international http://www.debian.org/international/ \
>& logs/web.$date.intl &
./urlcheck.py --require www.debian.org/devel http://www.debian.org/devel/ \
>& logs/web.$date.devel &
./urlcheck.py --require www.debian.org/News http://www.debian.org/News/ \
>& logs/web.$date.news &
./urlcheck.py --require www.debian.org/doc http://www.debian.org/doc/ \
>& logs/web.$date.doc &
./urlcheck.py --require www.debian.org/distrib http://www.debian.org/distrib/ \
>& logs/web.$date.distrib &
./urlcheck.py --require www.debian.org/releases http://www.debian.org/releases/ \
>& logs/web.$date.rel &
# Running urlcheck on /security produces huge logs of broken URLs, e.g. because old distributions and their files have since been replaced
# ./urlcheck.py --require www.debian.org/security http://www.debian.org/security/ \
# >& logs/web.$date.sec &
./urlcheck.py --require www.debian.org/ports http://www.debian.org/ports/ \
>& logs/web.$date.ports &
./urlcheck.py --require www.debian.org/CD http://www.debian.org/CD/ \
>& logs/web.$date.cd &
./urlcheck.py --require www.debian.org/users http://www.debian.org/users/ \
>& logs/web.$date.users &
# ./urlcheck.py --require spi-inc.org/ http://www.spi-inc.org/ >& spi.$date
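(The HTML below appears to be a small local test page for the checker, exercising valid and broken ftp:// links, a redirect, 404 and 403 responses, and a few known-bad hosts.)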
test a local file
<a href="http:invalid/url">invalid url</a>
<a href="./test2.html">test2.html</a><br>
an invalid ftp
<a href="ftp://ftp.us.debian.org/debian/ls-jdjlR">ftp://ftp.debian.org/debian/ls-lR</a><br>
a valid ftp
<a href="ftp://ftp.us.debian.org/debian/ls-lR">ftp://ftp.debian.org/debian/ls-lR</a><br>
<a href="ftp://ftp.au.debian.org/debian/">ftp://ftp.au.debian.org/debian/</a><br>
<a href="ftp://ftp.br.debian.org/debian/">ftp://ftp.br.debian.org/debian/</a><br>
testing a redirect (301)
<a href="http://www.msu.edu/">http://www.msu.edu/</a><br>
a (404) Not Found
<a href="http://www.above.net/">http://www.above.net/</a><br>
a (403) Forbidden
<a href="http://www.it.net.au/">http://www.it.net.au/</a><br>
<a href="http://www.kachinatech.com/">http://www.kachinatech.com/</a><br>
do a header dump
<a href="http://cgi.debian.org/cgi-bin/headers.pl">http://cgi.debian.org/cgi-bin/headers.pl</a><br>
main test
<a href="http://www.alpha-processor.com/">http://www.alpha-processor.com/</a><br>
<a href="http://os.inf.tu-dresden.de/index_e.html">http://os.inf.tu-dresden.de/index_e.html</a><br>
bad url and http://os.inf.tu-dresden.de/ is in German
<a href="http://www.alpha-processor.com/"></a><br>
<a href="http://www.corel.com/"></a><br>
<a href="http://www.sun.se/"></a><br>
<a href="http://www.mapblast.com/"></a><br>
<a href="http://kt.linuxcare.com/KC/debian-hurd/"></a><br>
<a ref=""></a><br>
<a href="http://www.debian.org/">http://www.debian.org/</a><br>
#!/usr/bin/python
# Copyright 2001-2003 James A. Treacy
# This code may be distributed under the terms of the GPL
import sys, urllib, htmllib, httplib, ftplib, formatter, urlparse, socket;
import re, signal
from sgmllib import SGMLParser
TIMEOUT = 5
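# Abandon any single request after TIMEOUT seconds, enforced with SIGALRM and
# the handler below.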
def handler(signum, frame):
    # print 'Signal handler called with signal', signum
    raise IOError, "timed out accessing site"
def do_page(page_url):
print "Looking into", page_url
try:
signal.signal(signal.SIGALRM, handler)
signal.alarm(TIMEOUT)
current = urllib.urlopen(page_url)
signal.alarm(0)
# print current.geturl()
# print current.read()
except (IOError, EOFError), arg:
print "Error accessing page:", arg.args[0]
return
parse = htmllib.HTMLParser(formatter.NullFormatter())
parse.feed(current.read())
parse.close()
#if debug:
# print parse.anchorlist
for url in parse.anchorlist:
parts = urlparse.urlparse(url)
parts = parts[0], parts[1], parts[2], parts[3], parts[4], ''
urlnew = urlparse.urlunparse(parts)
# print "TESTING:", url, urlnew
url = urlparse.urljoin(page_url, urlnew)
# print url
check_url(url)
httpdone[url] = 1
def check_url(url):
    # check url exist
    parts = urlparse.urlparse(url)
    parts = parts[0], parts[1], parts[2], parts[3], parts[4], ''
    urlnew = urlparse.urlunparse(parts)
    if debug:
        if urlnew != url:
            print "DEBUG: ", url, "converted to", urlnew
    url = urlnew
    if urlseen.has_key(url):
        if re.search('\(200\)', urlseen[url]) or re.search('\(302\)', urlseen[url]):
            # print " found a good url so not printing it"
            pass
        else:
            print " " + url + urlseen[url]
        return
    else:
        msg = "If you see this, then msg didn't get set properly"
        if parts[0] == 'ftp':
            msg = check_ftp(url, parts)
        elif parts[0] == 'https':
            msg = " : can't check https for validity"
        elif parts[0] == 'mailto':
            msg = " : can't check mailto for validity"
        elif parts[0] == 'news':
            msg = " : can't check news for validity"
        elif parts[0] == 'file':
            try:
                current = urllib.urlopen(url)
                httplist.append(url)
                msg = " : is ok"
            except IOError, arg:
                msg = " : Error: "+ arg.args[1]
        elif parts[0] == 'http':
            msg = check_http(url, parts)
        else:
            msg = " : unknown url type"
        if re.search('\(200\)', msg) or re.search('\(302\)', msg):
            # print " found a good url so not printing it"
            pass
        else:
            print " " + url + msg
        urlseen[url] = msg
def check_http(url, parts):
    # must do http connection using httplib so can parse the return codes
    # only if the url is good should it be added to httplist
    # print "entering check_http with url =", url
    try:
        # print "host =", parts[1]
        # print "file =", parts[2]
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(TIMEOUT)
        h = httplib.HTTP(parts[1])
        h.putrequest('HEAD', url)
        h.putheader('Accept', '*/*')
        h.endheaders()
        errcode, errmsg, headers = h.getreply()
        signal.alarm(0)
        # print " errcode =",errcode
        # print " errmsg =",errmsg
        # print " headers =",headers
        if errcode == 200 or errcode == 302:
            msg = " : (" + str(errcode) + ") " + errmsg
            add_url(url)
        elif errcode == 301:
            headers = str(headers)
            #print " headers =", headers
            result = re.findall("Location:\s*(.*)\n", headers)
            if result and result[0]:
                msg = " : Error = (" + str(errcode) + ") " + errmsg + ". New URL: " + result[0]
            else:
                msg = " : Error = (" + str(errcode) + ") " + errmsg
        elif errcode == 400 or errcode == 403 or errcode == 404:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(TIMEOUT)
            h = httplib.HTTP(parts[1])
            h.putrequest('HEAD', parts[2])
            h.endheaders()
            errcode, errmsg, headers = h.getreply()
            signal.alarm(0)
            # print " headers =", headers
            if errcode == 200 or errcode == 302:
                add_url(url)
                msg = " : (" + str(errcode) + ") " + errmsg
            else:
                msg = " : Error = (" + str(errcode) + ") " + errmsg
        else:
            msg = " : Error = (" + str(errcode) + ") " + errmsg
    except IOError, arg:
        msg = " : IOError: " + arg.args[0]
    except socket.error, arg:
        msg = " : Error: " + str(arg.args)
    return msg
def ftp_file_exists(string):
    kluge[0] = 1
def check_ftp(url, parts):
    try:
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(TIMEOUT)
        ftp = ftplib.FTP(parts[1])
        ftp.login()
        # listcmd = "LIST " + parts[2]
        # print " listcmd =", listcmd
        #ftp.retrlines(listcmd)
        kluge[0] = 0
        ftp.dir(parts[2], ftp_file_exists)
        if kluge[0]:
            msg = " : is ok"
        else:
            msg = " : Error: file doesn't exist"
        ftp.quit()
        signal.alarm(0)
    except socket.error, arg:
        msg = " : Error: " + str(arg.args)
    except (IOError, ftplib.error_perm, ftplib.error_temp, EOFError), arg:
        msg = " : Error: " + str(arg)
    #retrlines (command[, callback])
    #dir (argument[, ...])
    #cwd (pathname)
    return msg
def add_url(url):
    for x in require:
        if not re.search(x, url):
            # print " ", url, "is missing", x
            return
    for x in ignore:
        if re.search(x, url):
            # print " ", url, "includes", x, "which is being ignored"
            return
    httplist.append(url)
# BEGIN MAIN PROGRAM
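# Breadth-first crawl: seed httplist with the starting URL, then repeatedly
# pop a page, check every link on it (do_page/check_url), and let add_url()
# queue further in-scope pages until the list is empty.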
debug=0
kluge = []
kluge.append(0)
if (len(sys.argv) > 1):
    starturl = sys.argv[1]
else:
    print 'Error: first argument should be the starting URL'
    sys.exit()
httplist = []
httpdone = {}
urlseen = {}
ignore = ['/Packages', 'News/weekly/oldurl', '/Lists-Archives']
require = ['www.debian.org']
#print "Looking into", starturl
#check_url(starturl)
#sys.exit()
httplist.append(starturl)
if len(httplist):
    url = httplist.pop(0)
else:
    print "Error: no http urls"
    sys.exit()
while (1):
    do_page(url)
    httpdone[url] = 1
    if (len(httplist) > 0):
        url = httplist.pop(0)
    else:
        break
print "Program Finished Normally"
#!/usr/bin/python
# Copyright 2001-2003 James A. Treacy
# This code may be distributed under the terms of the GPL
import sys, urllib, sgmllib, htmllib, httplib, ftplib, formatter, urlparse, socket;
import re, signal, getopt;
TIMEOUT = 15
debug=0
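# This second copy appears to be the one run.urlcheck actually drives: longer
# timeout, Host: headers on requests, --require/--ignore handling via getopt,
# and a skip-list of non-compliant servers.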
def handler(signum, frame):
    # print 'Signal handler called with signal', signum
    raise IOError, "gave up on site (" + repr(TIMEOUT) + " second limit)"
def do_page(page_url):
print "Looking into", page_url
try:
signal.signal(signal.SIGALRM, handler)
signal.alarm(TIMEOUT)
current = urllib.urlopen(page_url)
signal.alarm(0)
# print current.geturl()
# print current.read()
except (IOError, EOFError), arg:
print "Error accessing page:", arg.args[0]
return
parse = htmllib.HTMLParser(formatter.NullFormatter())
try:
parse.feed(current.read())
parse.close()
except (AttributeError, IOError, TypeError, ValueError, sgmllib.SGMLParseError):
print " Error reading page:", page_url
#if debug:
# print parse.anchorlist
for url in parse.anchorlist:
parts = urlparse.urlparse(url)
parts = parts[0], parts[1], parts[2], parts[3], parts[4], ''
urlnew = urlparse.urlunparse(parts)
# print "TESTING:", url, urlnew
url = urlparse.urljoin(page_url, urlnew)
# print url
check_url(url)
httpdone[url] = 1
def check_url(url):
    # check url exist
    parts = urlparse.urlparse(url)
    if not parts[0]:
        print " ", url, " : Error: invalid url"
        return
    # if re.search('//', parts[2]):
    #     print " ", url, "Error: too many '/'"
    path_tmp = re.sub('//+', '/', parts[2])
    query_tmp = re.sub('&amp;','&',parts[4])
    parts = parts[0], parts[1], path_tmp, parts[3], query_tmp, ''
    urlnew = urlparse.urlunparse(parts)
    if debug:
        if urlnew != url:
            print "DEBUG: ", url, "converted to", urlnew
    url = urlnew
    if urlseen.has_key(url):
        if re.search('\(200\)', urlseen[url]) or re.search('\(302\)', urlseen[url]):
            # print " found a good url so not printing it"
            pass
        else:
            print " " + url + urlseen[url]
            sys.stdout.flush()
        return
    else:
        msg = "If you see this, then msg didn't get set properly"
        if parts[0] == 'ftp':
            msg = check_ftp(url, parts)
        elif parts[0] == 'https':
            msg = " : can't check https for validity"
        elif parts[0] == 'mailto':
            msg = " : can't check mailto for validity"
        elif parts[0] == 'news':
            msg = " : can't check news for validity"
        elif parts[0] == 'file':
            try:
                current = urllib.urlopen(url)
                httplist.append(url)
                msg = " : is ok"
            except IOError, arg:
                msg = " : Error: "+ arg.args[1]
        elif parts[0] == 'http':
            msg = check_http(url, parts)
        else:
            msg = " : unknown url type"
        if re.search('\(200\)', msg) or re.search('\(302\)', msg):
            # print " found a good url so not printing it"
            pass
        else:
            print " " + url + msg
            sys.stdout.flush()
        urlseen[url] = msg
def check_http(url, parts):
    # must do http connection using httplib so can parse the return codes
    # only if the url is good should it be added to httplist
    # print "entering check_http with url =", url
    try:
        # print "host =", parts[1]
        # print "file =", parts[2]
        # print "url =", url
        # print "parts =", parts
        if parts[1] in noncompliant:
            return " : Error: site uses a non-compliant server. Not checking"
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(TIMEOUT)
        h = httplib.HTTP(parts[1])
        h.putrequest('HEAD', parts[2])
        h.putheader('Host', parts[1])
        # h.putheader('Accept', '*/*')
        h.endheaders()
        errcode, errmsg, headers = h.getreply()
        signal.alarm(0)
        # print " errcode =",errcode
        # print " errmsg =",errmsg
        # print " headers =",headers
        if errcode == 200 or errcode == 302:
            msg = " : (" + str(errcode) + ") " + errmsg
            headers = str(headers)
            #print " headers =", headers
            type = re.findall("Content-Type:\s*text/html.*\n", headers)
            if len(type) or errcode == 302:
                add_url(url)
            #else:
            #    print "not adding " + url + ", not html\n"
        elif errcode == 301:
            headers = str(headers)
            #print " headers =", headers
            result = re.findall("Location:\s*(.*)\n", headers)
            if result and result[0]:
                msg = " : Error = (" + str(errcode) + ") " + errmsg + ". New URL: " + result[0]
            else:
                msg = " : Error = (" + str(errcode) + ") " + errmsg
        elif errcode == 400 or errcode == 403 or errcode == 404:
            # print "TRYING AGAIN"
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(TIMEOUT)
            h = httplib.HTTP(parts[1])
            h.putrequest('HEAD', url)
            h.putheader('Host', parts[1])
            h.endheaders()
            errcode, errmsg, headers = h.getreply()
            signal.alarm(0)
            # print " headers =", headers
            if errcode == 200 or errcode == 302:
                add_url(url)
                msg = " : (" + str(errcode) + ") " + errmsg
            else:
                msg = " : Error = (" + str(errcode) + ") " + errmsg
        else:
            msg = " : Error = (" + str(errcode) + ") " + errmsg
    except IOError, arg:
        msg = " : IOError: " + arg.args[0]
    except socket.error, arg:
        msg = " : Error: " + str(arg.args)
    except ValueError:
        msg = " : Error: URL not valid "
    return msg
def ftp_file_exists(string):
    kluge[0] = 1
def check_ftp(url, parts):
    try:
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(TIMEOUT)
        ftp = ftplib.FTP(parts[1])
ftp.login<