Corpora: Re: Html concordancing

Noord G.J.M. van vannoord at let.rug.nl
Wed May 10 13:19:12 UTC 2000


below is a web-script that does something similar to webcorp. 

-- 
Gertjan van Noord Alfa-informatica, RUG,  Postbus 716, 9700 AS Groningen
vannoord at let dot rug dot nl            http://www.let.rug.nl/~vannoord/



#!/usr/local/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord at let.rug.nl
BEGIN {
  use CGI::Carp qw(carpout);
  open(LOG,">>/users1/vannoord/tmp/log/nc")
    or die "Unable to open /users1/vannoord/tmp/log/nc: $!\n";
  carpout(*LOG);
}

use strict;
use CGI qw(:standard);
use LWP::UserAgent;
use HTML::LinkExtor;
use HTML::TreeBuilder;
use HTML::FormatText;

my %languages= (
'XX'  => 'Any language',
'zh'  => 'Chinese',
'cs'  => 'Czech',
'da'  => 'Danish',
'nl'  => 'Dutch',
'en'  => 'English',
'et'  => 'Estonian',
'fi'  => 'Finnish',
'fr'  => 'French',
'de'  => 'German',
'el'  => 'Greek',
'he'  => 'Hebrew',
'hu'  => 'Hungarian',
'is'  => 'Icelandic',
'it'  => 'Italian',
'ja'  => 'Japanese',
'ko'  => 'Korean',
'lv'  => 'Latvian',
'lt'  => 'Lithuanian',
'no'  => 'Norwegian',
'pl'  => 'Polish',
'pt'  => 'Portuguese',
'ro'  => 'Romanian',
'ru'  => 'Russian',
'es'  => 'Spanish',
'sv'  => 'Swedish',
	      );

my ($url,$ignore_url, at links,$word);

param('nr',0) unless param('nr');
param('nr',200) unless param('nr') < 200;

param('lang','nl') unless param('lang');
param('lang','XX') unless exists $languages{param('lang')};

$word=param("word");

my %engines = 
  (
   'surfnet' => sub {
     $url='http://search.surfnet.nl/cgi-bin/search.pl?nbq=10&fmt=c&lang='.
       param('lang').'&Web=on&zoekterm=' . $word . '&stq='.param('nr');
     $ignore_url='^[/#]|surfnet.nl|pdf|mailto:|altavista';
   },
   'altavista' => sub {
     $url='http://www.altavista.com/cgi-bin/query?pg=q&kl='.param('lang').
       '&text=yes&q=' . $word . '&stq='.param('nr');
     $ignore_url='^[/#]|www.altavista.com';
   },
   'telegraaf' => sub {
     $url='http://green.telegraafnet.nl/?query='.$word .
       '&page='.int((param('nr')+10)/10);
     $ignore_url='^[/#]|telegraaf|www.autovisie.nl|www.prive.nl|www.linux.org';
   }
  );

if (param('engine')) {
  param('engine','surfnet') unless exists $engines{param('engine')};
}

my $opt_s='(?:(?:[\.\?\!][\'\"]*)\s+|(?:[\t ]*\n\n\s*))';
                         # end-of-sentence (wie het weet mag het zeggen).
                         # herkent geen afkortingen...

print header,
      start_html(-'title'=>'NetKwic',
                 -'author'=>'vannoord at let.rug.nl',
                 -'style'=>{'src'=>'/~vannoord/vn.css'}),
      h1('NetKwic'),
      start_form,
      "Type in a word:\n",
      textfield(-'name'=>'word'),
      radio_group(-'name'=>'engine',
		  -'values'=>['surfnet','altavista','telegraaf'],
		  -'default'=>'surfnet'),
      popup_menu(-'name'=>'lang',
		 -'values'=> [
			      sort { $languages{$a} cmp 
				     $languages{$b} }(keys(%languages))],
                 -'default'=>'nl',
                 -'labels'=>\%languages),
      submit();

if ($word && param('engine')) {
  my $ua = new LWP::UserAgent;
  $ua->agent('NetKwic © 1997 G.J. van Noord, vannoord at let.rug.nl'.$ua->agent);
  $ua->timeout(10);
  $ua->max_size(30000);

  $engines{param('engine')}->();

  my $self = self_url;
  $self =~ s/&nr=[0-9]+//g;

  print "<a href=", 
        $self,
        "&nr=",
        param('nr')+10,
        ">More with <i>$word</i></a><br><p><ol>\n";

  # links extractor. 
  my $p = HTML::LinkExtor->new( sub {
				  my ($tag,$attr,$link)=@_;
				  if ( $tag eq 'a'    &&
				      $attr eq 'href' &&
				      $link !~ /$ignore_url/o) {
				    push(@links,$link);  
				  }
				});

  my $req=new HTTP::Request GET => $url;
  my $res = $ua->request($req,sub {$p->parse($_[0])});
  die "Error: " . $res->code . " " . $res->message ."\n" if $res->is_error;
  my $link;
  foreach $link (@links) {
    printf("<li><a href=%s>%s</a><br>\n<table>\n",$link,$link);
    $req = new HTTP::Request GET => $link;
    $res = $ua->request($req);
    if ($res->is_success) {
      if ($res->header('Content_Type') =~ m|text/|) {
	my $html=HTML::TreeBuilder->new();
	$html->parse($res->content);
	my $formatter=HTML::FormatText->new();
	my $input = $formatter->format($html);
	$input =~ tr/\r//d;      # does this get through?
	$input =~ s/\s*[-=][-=][-=]+\s*/\n\n/sg;
	$input =~ s/\[IMAGE\]/\n\n/sg;
	$input =~ s/\n +/\n/sg;
	$input =~ s/\[TABLE NOT SHOWN\]/\n\n/sg;
	my $znr;
	while ($input =~ /.*?(?:$opt_s)/sg) {
	  $_=$&;
	  $znr++;
	  tr/\n\t / /s;
	  if (/$word/io) {
	    s/($word)/<b>$1<\/b>/ig;
	    printf("<tr><td>%s</td><td>%s</td></tr>",$znr,$_);
	  }
	}
	print "</table>\n";
	if ($res->header('X-Content-Range')) {
	  print "(truncated)";
	}
      } else { #not text
	print "</table><i>\n";
	print $res->header('Content_Type');
        print "</i></li>\n";
      }
    } else {   #link failed
      print "</table><i>\n";
      print $res->code() . ": ".$res->message();
      print "</i></li>\n";
    }
    print "</li>\n";
  }
}
print end_form;

print "</ol>", hr(), end_html;



More information about the Corpora mailing list