NAME
Ufal::NameTag - bindings to NameTag library http://ufal.mff.cuni.cz/nametag.
SYNOPSIS
use Ufal::NameTag;
my = 'czech-cnec2.0-140304.ner';
my = Ufal::NameTag::Ner::load() or die "Cannot load NER from file ''\n";
my = Ufal::NameTag::Forms->new(); ->push(./gen.sh) for qw(Jan Hus bydlel v Praze .);
my = Ufal::NameTag::NamedEntities->new();
->recognize(, );
for my (0 .. ->size()-1) {
my = ->get();
printf "entity of type %s: '%s'\n", ->{type},
join(' ', map {->get(./gen.sh)} (->{start} .. ->{start}+->{length}-1));
}
REQUIREMENTS
To compile the module, C++11 compiler is needed, either g++
4.7 or newer, clang
3.2 or newer or Visual Studio 2015
.
DESCRIPTION
Ufal::NameTag
is a Perl binding to NameTag library http://ufal.mff.cuni.cz/nametag.
All classes can be imported into the current namespace using the all
export tag.
The bindings is a straightforward conversion of the C++
bindings API. Vectors do not have native Perl interface, see Ufal::NameTag::Forms source for reference. Static methods and enumerations are available only through the module, not through object instance.
Wrapped C++ API
The C++ API being wrapped follows. For a API reference of the original C++ API, see L\<http://ufal.mff.cuni.cz/nametag/api-reference\>.
Helper Structures
-----------------
typedef vector<string> Forms;
struct TokenRange {
size_t start;
size_t length;
};
typedef vector<TokenRange> TokenRanges;
struct NamedEntity {
size_t start;
size_t length;
string type;
NamedEntity();
NamedEntity(size_t start, size_t length, const string& type);
};
typedef vector<NamedEntity> NamedEntities;
Main Classes
------------
class Version {
public:
unsigned major;
unsigned minor;
unsigned patch;
string prerelease;
static Version current();
};
class Tokenizer {
public:
virtual void setText(const char* text);
virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
static Tokenizer* newVerticalTokenizer();
};
class Ner {
static ner* load(const char* fname);
virtual void recognize(Forms& forms, NamedEntities& entities) const;
virtual void entityTypes(Forms& types) const;
virtual void gazetteers(Forms& gazetteers, Ints& gazetteer_types) const;
virtual Tokenizer* newTokenizer() const;
};
Example
run_ner
Simple example performing named entity recognition.
use warnings;
use strict;
use open qw(:std :utf8);
use Ufal::NameTag;
sub encode_entities($) {
my ($text) = @_;
$text =~ s/[&<>"]/$& eq "&" ? "&" : $& eq "<" ? "<" : $& eq ">" ? ">" : """/ge;
return $text;
}
sub sort_entities($) {
my ($entities) = @_;
my @entities = ();
for (my ($i, $size) = (0, $entities->size()); $i < $size; $i++) {
push @entities, $entities->get($i);
}
return sort { $a->{start} <=> $b->{start} || $b->{length} <=> $a->{length} } @entities;
}
@ARGV >= 1 or die "Usage: $0 recognizer_model\n";
print STDERR "Loading ner: ";
my $ner = Ufal::NameTag::Ner::load($ARGV[0]);
$ner or die "Cannot load recognizer from file '$ARGV[0]'\n";
print STDERR "done\n";
shift @ARGV;
my $forms = Ufal::NameTag::Forms->new();
my $tokens = Ufal::NameTag::TokenRanges->new();
my $entities = Ufal::NameTag::NamedEntities->new();
my @sorted_entities;
my @open_entities;
my $tokenizer = $ner->newTokenizer();
$tokenizer or die "No tokenizer is defined for the supplied model!";
for (my $not_eof = 1; $not_eof; ) {
my $text = '';
# Read block
while (1) {
my $line = <>;
last unless ($not_eof = defined $line);
$text .= $line;
chomp($line);
last unless length $line;
}
# Tokenize and recognize
$tokenizer->setText($text);
my $t = 0;
while ($tokenizer->nextSentence($forms, $tokens)) {
$ner->recognize($forms, $entities);
@sorted_entities = sort_entities($entities);
# Write entities
for (my ($i, $size, $e) = (0, $tokens->size(), 0); $i < $size; $i++) {
my $token = $tokens->get($i);
my ($token_start, $token_length) = ($token->{start}, $token->{length});
print encode_entities(substr $text, $t, $token_start - $t);
print '<sentence>' if $i == 0;
# Open entities starting at current token
for (; $e < @sorted_entities && $sorted_entities[$e]->{start} == $i; $e++) {
printf '<ne type="%s">', encode_entities($sorted_entities[$e]->{type});
push @open_entities, $sorted_entities[$e]->{start} + $sorted_entities[$e]->{length} - 1;
}
# The token itself
printf '<token>%s</token>', encode_entities(substr $text, $token_start, $token_length);
# Close entities ending after current token
while (@open_entities && $open_entities[-1] == $i) {
print '</ne>';
pop @open_entities;
}
print '</sentence>' if $i + 1 == $size;
$t = $token_start + $token_length;
}
}
# Write rest of the text
print encode_entities(substr $text, $t);
}
AUTHORS
Milan Straka <straka@ufal.mff.cuni.cz>
Jana Straková <strakova@ufal.mff.cuni.cz>
COPYRIGHT AND LICENCE
Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.