Corpora: Re: Html concordancing
Noord G.J.M. van
vannoord at let.rug.nl
Wed May 10 13:19:12 UTC 2000
below is a web-script that does something similar to webcorp.
--
Gertjan van Noord Alfa-informatica, RUG, Postbus 716, 9700 AS Groningen
vannoord at let dot rug dot nl http://www.let.rug.nl/~vannoord/
#!/usr/local/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord at let.rug.nl
BEGIN {
use CGI::Carp qw(carpout);
open(LOG,">>/users1/vannoord/tmp/log/nc")
or die "Unable to open /users1/vannoord/tmp/log/nc: $!\n";
carpout(*LOG);
}
use strict;
use CGI qw(:standard);
use LWP::UserAgent;
use HTML::LinkExtor;
use HTML::TreeBuilder;
use HTML::FormatText;
my %languages= (
'XX' => 'Any language',
'zh' => 'Chinese',
'cs' => 'Czech',
'da' => 'Danish',
'nl' => 'Dutch',
'en' => 'English',
'et' => 'Estonian',
'fi' => 'Finnish',
'fr' => 'French',
'de' => 'German',
'el' => 'Greek',
'he' => 'Hebrew',
'hu' => 'Hungarian',
'is' => 'Icelandic',
'it' => 'Italian',
'ja' => 'Japanese',
'ko' => 'Korean',
'lv' => 'Latvian',
'lt' => 'Lithuanian',
'no' => 'Norwegian',
'pl' => 'Polish',
'pt' => 'Portuguese',
'ro' => 'Romanian',
'ru' => 'Russian',
'es' => 'Spanish',
'sv' => 'Swedish',
);
my ($url,$ignore_url, at links,$word);
param('nr',0) unless param('nr');
param('nr',200) unless param('nr') < 200;
param('lang','nl') unless param('lang');
param('lang','XX') unless exists $languages{param('lang')};
$word=param("word");
my %engines =
(
'surfnet' => sub {
$url='http://search.surfnet.nl/cgi-bin/search.pl?nbq=10&fmt=c&lang='.
param('lang').'&Web=on&zoekterm=' . $word . '&stq='.param('nr');
$ignore_url='^[/#]|surfnet.nl|pdf|mailto:|altavista';
},
'altavista' => sub {
$url='http://www.altavista.com/cgi-bin/query?pg=q&kl='.param('lang').
'&text=yes&q=' . $word . '&stq='.param('nr');
$ignore_url='^[/#]|www.altavista.com';
},
'telegraaf' => sub {
$url='http://green.telegraafnet.nl/?query='.$word .
'&page='.int((param('nr')+10)/10);
$ignore_url='^[/#]|telegraaf|www.autovisie.nl|www.prive.nl|www.linux.org';
}
);
if (param('engine')) {
param('engine','surfnet') unless exists $engines{param('engine')};
}
my $opt_s='(?:(?:[\.\?\!][\'\"]*)\s+|(?:[\t ]*\n\n\s*))';
# end-of-sentence (wie het weet mag het zeggen).
# herkent geen afkortingen...
print header,
start_html(-'title'=>'NetKwic',
-'author'=>'vannoord at let.rug.nl',
-'style'=>{'src'=>'/~vannoord/vn.css'}),
h1('NetKwic'),
start_form,
"Type in a word:\n",
textfield(-'name'=>'word'),
radio_group(-'name'=>'engine',
-'values'=>['surfnet','altavista','telegraaf'],
-'default'=>'surfnet'),
popup_menu(-'name'=>'lang',
-'values'=> [
sort { $languages{$a} cmp
$languages{$b} }(keys(%languages))],
-'default'=>'nl',
-'labels'=>\%languages),
submit();
if ($word && param('engine')) {
my $ua = new LWP::UserAgent;
$ua->agent('NetKwic © 1997 G.J. van Noord, vannoord at let.rug.nl'.$ua->agent);
$ua->timeout(10);
$ua->max_size(30000);
$engines{param('engine')}->();
my $self = self_url;
$self =~ s/&nr=[0-9]+//g;
print "<a href=",
$self,
"&nr=",
param('nr')+10,
">More with <i>$word</i></a><br><p><ol>\n";
# links extractor.
my $p = HTML::LinkExtor->new( sub {
my ($tag,$attr,$link)=@_;
if ( $tag eq 'a' &&
$attr eq 'href' &&
$link !~ /$ignore_url/o) {
push(@links,$link);
}
});
my $req=new HTTP::Request GET => $url;
my $res = $ua->request($req,sub {$p->parse($_[0])});
die "Error: " . $res->code . " " . $res->message ."\n" if $res->is_error;
my $link;
foreach $link (@links) {
printf("<li><a href=%s>%s</a><br>\n<table>\n",$link,$link);
$req = new HTTP::Request GET => $link;
$res = $ua->request($req);
if ($res->is_success) {
if ($res->header('Content_Type') =~ m|text/|) {
my $html=HTML::TreeBuilder->new();
$html->parse($res->content);
my $formatter=HTML::FormatText->new();
my $input = $formatter->format($html);
$input =~ tr/\r//d; # does this get through?
$input =~ s/\s*[-=][-=][-=]+\s*/\n\n/sg;
$input =~ s/\[IMAGE\]/\n\n/sg;
$input =~ s/\n +/\n/sg;
$input =~ s/\[TABLE NOT SHOWN\]/\n\n/sg;
my $znr;
while ($input =~ /.*?(?:$opt_s)/sg) {
$_=$&;
$znr++;
tr/\n\t / /s;
if (/$word/io) {
s/($word)/<b>$1<\/b>/ig;
printf("<tr><td>%s</td><td>%s</td></tr>",$znr,$_);
}
}
print "</table>\n";
if ($res->header('X-Content-Range')) {
print "(truncated)";
}
} else { #not text
print "</table><i>\n";
print $res->header('Content_Type');
print "</i></li>\n";
}
} else { #link failed
print "</table><i>\n";
print $res->code() . ": ".$res->message();
print "</i></li>\n";
}
print "</li>\n";
}
}
print end_form;
print "</ol>", hr(), end_html;
More information about the Corpora
mailing list