i18nwwwfix.pl 5.56 KB
Newer Older
Bas Zoetekouw's avatar
Bas Zoetekouw committed
1
#!/usr/bin/perl
2 3 4 5 6 7

# This script enables the creation of copies of the www site
# in only two languages: the default language (see $DEFAULT)
# and the language provided with the -p switch, giving preference
# to the latter.

8
# (c) Javier Fernández-Sanguino Peña <jfs@debian.org>
9 10 11 12 13 14 15 16 17 18 19 20 21 22
# Distributed under the GNU GPL License (see http://www.gnu.org/gpl)

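# It fixes all the href links in the target directory (the current
# directory unless -d is given) and in all directories under it,
# depending on which files exist.
# Current options
# -p (required): tells which preferred language to use
# -d (optional): directory to process instead of the current one
# -v (optional): activates verbose output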


# In order to retrieve Debian's website try something
# like:
# (for Spanish users)
# 1.- wget -o debian.log -m -k http://www.es.debian.org
# 2.- (go to the dir created by wget, in our case www.es.debian.org)
Bas Zoetekouw's avatar
Bas Zoetekouw committed
23
# 3.- perl i18nwwwfix.pl -p es
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
#
# NOTES:
# 1.- Customize the URL and -p option to fix  for your closest mirror and
# language
# 2.- after doing this you can remove all other languages
# besides yours (anyone care to give an easy bash line here?)
# 3.- afterwards check all URL (try checkbot) and send any bugs regarding
# bad fixed links to me.



# TODO:
# 1.- It currently does not understand # in links and fixes
# them incorrectly


use Getopt::Std;
use IO::File;
Bas Zoetekouw's avatar
Bas Zoetekouw committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
use Cwd;
use File::Copy;

use strict;
use warnings;

# Command-line handling and entry point.
#   -p LANG  preferred language code (required)
#   -d DIR   directory to process (defaults to the current working directory)
#   -v       verbose progress output on STDERR
my %opts;
getopts('vp:d:', \%opts)
	or die "Usage: $0 -p LANG [-d DIR] [-v]\n";

my $POST         = $opts{'p'};
my $DEFAULT      = "en"; #Default language is english (en)

# -p is mandatory: without it every lookup would be built from an undefined
# language code (e.g. "index..html") and no link could ever be fixed.
die "Usage: $0 -p LANG [-d DIR] [-v]\n"
	unless defined $POST and length $POST;

# Directories that are never descended into: anything hidden (which covers
# .svn and .git) and CVS.  The alternation is grouped and anchored so that
# ordinary names merely containing "CVS" or ending in "git" are not skipped.
my $INVALID_DIRS = '\A(?:\.|CVS\z)';

my $current_dir = $opts{'d'} || getcwd;
my $verbose     = $opts{'v'};

fixDirectory($current_dir);

exit 0;

Bas Zoetekouw's avatar
Bas Zoetekouw committed
62 63

# Recursively walks a directory and fixes the links of every HTML file
# (*.html / *.htm) found in it or in any of its subdirectories.
#
# Parameters:
#   $directory - path of the directory to process
#
# Symbolic links to directories are not followed, and directories whose
# name matches $INVALID_DIRS are skipped.  Dies if the directory cannot be
# opened.
sub fixDirectory
{
	my ($directory) = @_;

	opendir (my $dir, $directory)
		or die ("I cannot read $directory: $!\n");

	# Read the whole listing before touching anything: fix_html_file creates
	# and renames files in this very directory, and the handle would
	# otherwise stay open for the whole recursion below.
	my @entries = readdir ($dir);
	closedir ($dir);

	foreach my $file ( @entries )
	{
		next  if  $file eq '.' or $file eq '..';

		warn "Checking $file\n" if $verbose;

		my $path = "${directory}/${file}";

		if ( -d $path and not -l $path )
		{
			if ( $file =~ /$INVALID_DIRS/ )
			{
				warn "Not a valid dir: $file \n" if $verbose;
			}
			else
			{
				fixDirectory ($path);
			}
		}
		elsif ( -f $path and $file =~ /\.html?\z/ )
		{
			# Only regular files: a symlinked directory that happens to be
			# called "something.html" must not be opened as a file.
			fix_html_file ($directory, $path);
		}
	}

	return;
}

Bas Zoetekouw's avatar
Bas Zoetekouw committed
92

93 94
# Rewrites the A HREF="..." links of one HTML file so that they point at
# language-specific files that actually exist on disk.
#
# Parameters:
#   $directory - directory containing the file; links are resolved
#                relative to it
#   $file      - path of the HTML file to rewrite
#
# The fixed text is first written to "$file.bak", which then replaces
# $file.  Dies if a file cannot be opened, written or moved.
sub fix_html_file
{
	my ($directory,$file) = @_;

	warn "Opening the file $file.\n" if $verbose;

	open (my $in, '<', $file) or die ("Cannot open ${file} : $!\n");
	open (my $out, '>', "${file}.bak")
		or die ("I cannot create a backup of ${file} : $!\n");

	while ( my $line = <$in> )
	{
		chomp $line;

		# Rebuild the line piece by piece.  $copied is the offset in $line
		# up to which text has already been transferred, so the text between
		# two links is copied exactly once however many links the line has.
		my $newline = "";
		my $copied  = 0;

		while ( $line =~ m/(A HREF=)\"(.*?)\"/gi )
		{
			# Save everything that depends on this match before calling the
			# helper, which runs regular expressions of its own.
			my ($prefix, $old_ref)  = ($1, $2);
			my ($match_start, $match_end) = ($-[0], $+[0]);

			my $new_ref = _fix_reference ($directory, $old_ref);

			# $prefix keeps the letter case the page used for "A HREF=".
			$newline .= substr ($line, $copied, $match_start - $copied)
			          . $prefix . qq{"$new_ref"};
			$copied = $match_end;
		}
		$newline .= substr ($line, $copied);

		warn "Changing $line to $newline\n" if $verbose;
		print {$out} $newline, "\n";
	}
	close ($in);
	# Buffered write errors (e.g. a full disk) only show up on close.
	close ($out) or die ("Cannot finish writing ${file}.bak : $!\n");

	# Replace the original in a single step.  It is deliberately not
	# unlinked first, so a failed move leaves the original file in place.
	move("$file.bak", $file)
		or die("Couldn't move `$file.bak' to `$file': $!\n");

	return;
}

# Works out the replacement for a single link target.
#
# Parameters:
#   $directory - directory the reference is resolved against
#   $old_ref   - the value found inside HREF="..."
#
# Returns the reference to write back.  Candidates are tried in the order
# preferred language ($POST), default language ($DEFAULT), plain file; the
# first one that exists on disk wins.  If none exists, or the reference is
# not local, $old_ref is returned unchanged.
sub _fix_reference
{
	my ($directory, $old_ref) = @_;

	return $old_ref unless islocalreference($old_ref);

	warn "Checking reference $old_ref\n" if $verbose;

	# A "#fragment" is not part of the file name: set it aside for the
	# existence checks and put it back on the result.
	my ($path, $fragment) = $old_ref =~ /\A([^#]*)(#.*)?\z/s;
	$fragment = "" unless defined $fragment;

	# Nothing to look up for a link inside the same page ("#top").
	return $old_ref if $path eq "";

	my @candidates;
	if ( $path =~ m{/\z} )
	{
		# This is a directory... look for its index page
		warn "Fixing directory reference $old_ref\n" if $verbose;
		@candidates = map { $path.$_ }
			"index.$POST.html", "index.$DEFAULT.html", "index.html";
	}
	elsif ( $path =~ m{\A(.*)\.[^./]+\.html\z}s )
	{
		# This one does *not* use content negotiation: name.LANG.html.
		# The base is matched greedily and the language part may contain
		# neither "." nor "/", so "../dir/name.en.html" splits at the last
		# dot before ".html" rather than inside "..".
		warn "Fixing HTML reference $old_ref\n" if $verbose;
		my $base = $1;
		@candidates = ( "$base.$POST.html", "$base.$DEFAULT.html" );
	}
	elsif ( $path !~ /([\w-]+)\.([\w-]+)$/ )
	{
		# No extension at all: this one *does* use content negotiation
		warn "Fixing Content Negotiation reference $old_ref\n" if $verbose;
		@candidates = ( "$path.$POST.html", "$path.$DEFAULT.html", "$path.html" );
	}

	foreach my $candidate ( @candidates )
	{
		next unless -f "${directory}/${candidate}";

		my $new_ref = $candidate.$fragment;
		warn "Fixed reference $old_ref to $new_ref\n"
			if $verbose and $new_ref ne $old_ref;
		return $new_ref;
	}

	# No suitable file exists: leave the link exactly as it was
	return $old_ref;
}

Bas Zoetekouw's avatar
Bas Zoetekouw committed
210
# Checks if a reference points to a local resource, i.e. one that can be
# looked up as a file below the directory being processed.
#
# Parameters:
#   $reference - the value of an HREF attribute
#
# Returns 1 for a local reference and an empty/undef value otherwise.
# A reference is NOT local when it
#   - contains "://"                      (http://, ftp://, gopher://, ...)
#   - starts with a URI scheme            (mailto:, javascript:, news:, ...)
#   - starts with "//"                    (protocol-relative link to a host)
sub islocalreference
{
	my ($reference) = @_;

	return if $reference =~ m{://};

	# A scheme is a letter followed by letters, digits, "+", "." or "-" and
	# a colon.  A relative path cannot begin like that, so this never
	# rejects a real file reference.
	return if $reference =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*:};

	return if $reference =~ m{\A//};

	warn "Local reference: $reference\n" if $verbose;
	return 1;
}
Bas Zoetekouw's avatar
Bas Zoetekouw committed
222 223 224

__END__