#! /usr/bin/perl
#
# File: raw.pl
# Time-stamp: <01/12/17 17:56:55 martinc>
# $Id$
#
# Copyright (C) 2001 University of Edinburgh
# Author: Frank Keller, Garance Paris, Martin Corley
#
# Reads the text file named by -F, splits every line on whitespace and
# writes a CORSET corpus file to STDOUT: a short "%" header, a "-<name>"
# file marker, then one token per line.  Leading punctuation characters
# are tagged PUN1, the word itself UNK and trailing punctuation PUN2; an
# empty line is emitted after a token that ends in ".", "!" or "?".
#
# NB: If LC_CTYPE isn't set, this script will attempt to switch to an
# ISO8859-15 locale (latin1 with a Euro character). If words are not
# being tokenised as you expect, set LC_CTYPE to an appropriate value
# before attempting to segment your text.

require 5.004;

use strict;
use vars qw($opt_d $opt_m $opt_c $opt_F);   # globals filled in by getopts()

use POSIX qw(locale_h);
use Getopt::Std;

# NOTE(review): only -F is declared as taking an argument; -d, -m and -c
# are parsed as boolean flags, so $filename and $corpusname hold 1 (or
# undef) rather than a name.  The spec probably ought to be 'd:m:c:F:',
# but that would change the command line for existing callers -- confirm
# before changing.
getopts('dmcF:');

my $path       = $opt_d;    # parsed for compatibility; not used below
my $filename   = $opt_m;
my $corpusname = $opt_c;
my $inputfile  = $opt_F;

# Choose the character classification used by \w and \W in the tokeniser.
if ($ENV{'LC_CTYPE'}) {
    # Honour whatever locale the environment asks for.
    setlocale(LC_CTYPE, "");
}
else {
    setlocale(LC_CTYPE, "ISO8859.15");
    print STDERR "$0: no locale has been set (via LC_CTYPE). Using default\n";
    print STDERR " ISO8859.15 character-set\n";
}

my $date = localtime();

# print a header to keep gsearch happy
print "\% CORSET corpus file\n";
print "\% Created from $corpusname\n";
print "\% Processed using the raw.pl filter\n";
print "\% Created on $date\n";
print "\% Corpus file: $inputfile\n";
print "\n";
print "\-$filename\n";                      # file name

# Two-argument open is kept on purpose: the script declares "require
# 5.004", which predates the three-argument form.
# NOTE(review): this also means special names such as "-" or "cmd |"
# are interpreted by open().
open(INFILE, $inputfile) ||                 # input file
    die "Can't open file $inputfile: $!\n";

{
    use locale;                             # locale in force within the loop

    while (my $line = <INFILE>) {
        my @words = split(/\s+/, $line);

        foreach my $w (@words) {
            # Separate punctuation from word.  The pattern cannot fail
            # (every group may be empty).
            # NOTE(review): only the first punct/word/punct run is kept;
            # anything after it (e.g. the "t" of "don't") is discarded.
            my ($punct1, $lex, $punct2) = ($w =~ /(\W*)(\w*)(\W*)/);

            # create one token per leading punctuation character
            foreach my $p (split(//, $punct1)) {
                print "*\t$p\tPUN1\n";
            }

            # Output the word.  Compare against the empty string
            # explicitly: a plain truth test would drop the token "0".
            if ($lex ne "") {
                print "\t$lex\tUNK\n";
            }

            # create one token per trailing punctuation character
            my @p2 = split(//, $punct2);
            foreach my $p (@p2) {
                print "*\t$p\tPUN2\n";
            }

            # check for sentence boundary: last trailing punctuation
            # character is ".", "!" or "?"
            if (@p2 && $p2[-1] =~ /[.!?]/) {
                print "\n";
            }
        }
    }
}

close(INFILE);