#!/usr/bin/perl -w

use strict;

# Check a TAC 2009 KBP Track slot-filling task submission for various
# common errors, including:
#      * invalid run tag (should be a concatenation of your TAC 2009
#        team ID and the run number (1-3))
#      * multiple run tags
#      * invalid query id or slot name
#      * slot value link given for a non-linkable slot
#      * invalid entity id given for the slot value link
#      * missing value for a question (at least one response required
#        per question)
#      * answer from an invalid document (approximate check)
# [NB: "question" refers to a (query id, slot name) pair]
# Messages regarding the submission are printed to an error log.
#
# The submission file for the KBP slot-filling task has exactly one
# response per single-valued slot and at least one response for each
# list-valued slot, for each query (except that no response should be
# given for slots listed in the 'ignore' list of the query).
# A response is a line of the form
#     query_id slot_name run_id docid slot-value-link answer-string-as-the-remainder-of-the-line
# OR
#     query_id slot_name run_id NIL
#
# Usage is:
#     check_kbp_slot-filling.pl results_file
# where results_file is the name of the results submission file to be
# checked.
#
# Exit status is 0 when no errors were logged and 255 otherwise.

# Change this variable to the directory where the error log should be put
my $errlog_dir = ".";

# Checking stops (exit 255) once more than this many errors are logged.
my $MAX_ERRORS = 25;

# These values are specific to the TAC 2009 KBP track

# Test queries: query id => entity type plus the slots that must NOT be
# answered for that query.
my %queries = (
    SF1  => {type => "GPE", ignore => ["gpe:political_parties", "gpe:currency", "gpe:population", "gpe:capital"]},
    SF2  => {type => "ORG", ignore => []},
    SF3  => {type => "ORG", ignore => ["org:founded", "org:website", "org:headquarters"]},
    SF4  => {type => "ORG", ignore => ["org:headquarters", "org:website"]},
    SF5  => {type => "ORG", ignore => ["org:headquarters", "org:founded", "org:website"]},
    SF6  => {type => "ORG", ignore => []},
    SF7  => {type => "PER", ignore => []},
    SF8  => {type => "PER", ignore => []},
    SF9  => {type => "PER", ignore => ["per:date_of_birth", "per:origin", "per:age", "per:place_of_birth", "per:religion"]},
    SF10 => {type => "PER", ignore => []},
    SF11 => {type => "PER", ignore => ["per:date_of_birth", "per:age", "per:place_of_birth"]},
    SF12 => {type => "PER", ignore => []},
    SF13 => {type => "PER", ignore => []},
    SF14 => {type => "PER", ignore => []},
    SF15 => {type => "PER", ignore => []},
    SF16 => {type => "PER", ignore => []},
    SF17 => {type => "PER", ignore => []},
    SF18 => {type => "ORG", ignore => []},
    SF19 => {type => "GPE", ignore => ["gpe:political_parties", "gpe:capital", "gpe:population", "gpe:currency"]},
    SF20 => {type => "ORG", ignore => []},
    SF21 => {type => "ORG", ignore => []},
    SF22 => {type => "PER", ignore => []},
    SF23 => {type => "ORG", ignore => ["org:founded", "org:headquarters", "org:dissolved", "org:website"]},
    SF24 => {type => "ORG", ignore => []},
    SF25 => {type => "ORG", ignore => []},
    SF26 => {type => "ORG", ignore => []},
    SF27 => {type => "PER", ignore => ["per:place_of_birth", "per:age", "per:date_of_birth"]},
    SF28 => {type => "ORG", ignore => []},
    SF29 => {type => "ORG", ignore => []},
    SF30 => {type => "ORG", ignore => ["org:headquarters", "org:founded", "org:dissolved"]},
    SF31 => {type => "ORG", ignore => ["org:website", "org:founded", "org:number_of_employees/members"]},
    SF32 => {type => "ORG", ignore => ["org:headquarters", "org:founded"]},
    SF33 => {type => "ORG", ignore => ["org:founded", "org:number_of_employees/members", "org:headquarters"]},
    SF34 => {type => "PER", ignore => ["per:date_of_death", "per:date_of_birth", "per:age"]},
    SF35 => {type => "GPE", ignore => ["gpe:currency", "gpe:political_parties"]},
    SF36 => {type => "ORG", ignore => ["org:website", "org:founded", "org:headquarters"]},
    SF37 => {type => "ORG", ignore => []},
    SF38 => {type => "ORG", ignore => []},
    SF39 => {type => "ORG", ignore => []},
    SF40 => {type => "PER", ignore => []},
    SF41 => {type => "ORG", ignore => []},
    SF42 => {type => "ORG", ignore => []},
    SF43 => {type => "ORG", ignore => []},
    SF44 => {type => "ORG", ignore => []},
    SF45 => {type => "ORG", ignore => []},
    SF46 => {type => "ORG", ignore => ["org:founded", "org:headquarters"]},
    SF47 => {type => "PER", ignore => []},
    SF48 => {type => "PER", ignore => []},
    SF49 => {type => "ORG", ignore => ["org:founded", "org:headquarters"]},
    SF50 => {type => "GPE", ignore => ["gpe:political_parties", "gpe:currency", "gpe:capital"]},
    SF51 => {type => "ORG", ignore => []},
    SF52 => {type => "ORG", ignore => []},
    SF53 => {type => "GPE", ignore => ["gpe:currency", "gpe:established", "gpe:population"]},
);

# Slot Filler Type as defined in LDC publication
# "TAC 2009 KBP Sample Generic Infoboxes"; Version 1.0; May 21, 2009
# entity type => slot name => {quantity => 'single'|'list', linkable => 0|1}
my %slots = (
    'PER' => {
        'per:alternate_names' => {quantity => 'list',   linkable => 0},
        'per:date_of_birth'   => {quantity => 'single', linkable => 0},
        'per:age'             => {quantity => 'single', linkable => 0},
        'per:place_of_birth'  => {quantity => 'single', linkable => 1},
        'per:origin'          => {quantity => 'single', linkable => 0},
        'per:date_of_death'   => {quantity => 'single', linkable => 0},
        'per:place_of_death'  => {quantity => 'single', linkable => 1},
        'per:cause_of_death'  => {quantity => 'single', linkable => 0},
        'per:residences'      => {quantity => 'list',   linkable => 1},
        'per:schools_attended' => {quantity => 'list',  linkable => 1},
        'per:title'           => {quantity => 'list',   linkable => 0},
        'per:member_of'       => {quantity => 'list',   linkable => 1},
        'per:employee_of'     => {quantity => 'list',   linkable => 1},
        'per:religion'        => {quantity => 'single', linkable => 0},
        'per:spouse'          => {quantity => 'list',   linkable => 1},
        'per:children'        => {quantity => 'list',   linkable => 1},
        'per:parents'         => {quantity => 'list',   linkable => 1},
        'per:siblings'        => {quantity => 'list',   linkable => 1},
        'per:other_family'    => {quantity => 'list',   linkable => 1},
        'per:charges'         => {quantity => 'list',   linkable => 0},
    },
    'ORG' => {
        'org:alternate_names'                 => {quantity => 'list',   linkable => 0},
        'org:political/religious_affiliation' => {quantity => 'list',   linkable => 0},
        'org:top_members/employees'           => {quantity => 'list',   linkable => 1},
        'org:number_of_employees/members'     => {quantity => 'single', linkable => 0},
        'org:members'                         => {quantity => 'list',   linkable => 1},
        'org:member_of'                       => {quantity => 'list',   linkable => 1},
        'org:subsidiaries'                    => {quantity => 'list',   linkable => 1},
        'org:parents'                         => {quantity => 'list',   linkable => 1},
        'org:founded_by'                      => {quantity => 'list',   linkable => 1},
        'org:founded'                         => {quantity => 'single', linkable => 0},
        'org:dissolved'                       => {quantity => 'single', linkable => 0},
        'org:headquarters'                    => {quantity => 'single', linkable => 1},
        'org:shareholders'                    => {quantity => 'list',   linkable => 1},
        'org:website'                         => {quantity => 'single', linkable => 0},
    },
    'GPE' => {
        'gpe:alternate_names'   => {quantity => 'list',   linkable => 0},
        'gpe:capital'           => {quantity => 'single', linkable => 1},
        'gpe:subsidiary_orgs'   => {quantity => 'list',   linkable => 1},
        'gpe:top_employees'     => {quantity => 'list',   linkable => 1},
        'gpe:political_parties' => {quantity => 'list',   linkable => 1},
        'gpe:established'       => {quantity => 'single', linkable => 0},
        'gpe:population'        => {quantity => 'single', linkable => 0},
        'gpe:currency'          => {quantity => 'single', linkable => 0},
    },
);

# State shared with error(): these must stay file-scoped.
my $results_file;          # input file name
my $errlog;                # error log file name
my $errlog_fh;             # error log handle
my $num_errors = 0;        # errors logged so far
my $line_num   = 0;        # current line of the results file

my %qids;                  # number of answers returned for question
my %nils;                  # number of NIL answers returned for question

if ($#ARGV != 0) {
    print STDERR "Usage: $0 resultsfile\n";
    die "\n";
}
$results_file = $ARGV[0];

# set up output files: the log is named after the results file's last
# path component (rindex yields -1 when there is no "/", so +1 gives 0)
$errlog = $errlog_dir . "/"
        . substr($results_file, rindex($results_file, "/") + 1)
        . ".errlog";

# Use low-precedence `or`: with `||` the die would attach to the file
# name string (always true) and a failed open would go unnoticed.
open($errlog_fh, '>', $errlog)
    or die "Cannot open error log $errlog for writing: $!\n";
open(my $results_fh, '<', $results_file)
    or die "Unable to open results file $results_file: $!";

my $run_id = "";           # run tag seen on the first well-formed line

while (my $line = <$results_fh>) {
    chomp $line;
    $line_num++;
    next if ($line =~ /^\s*$/);

    # Whitespace-separated fields; the answer string keeps its internal
    # whitespace because the split is limited to 6 fields.
    my ($q, $slot_name, $tag, $docid, $slot_value_link, $answer)
        = split " ", $line, 6;

    if (!defined $docid || length($docid) == 0
        || !defined $tag || !defined $slot_name) {
        error("Wrong number of fields -- missing fields");
        next;
    }

    # make sure runtag is ok
    if (!$run_id) {
        # very first line --- remember tag
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9._]{1,12}[1-3]$/) {
            error("Run tag `$run_id' is malformed");
            next;
        }
    }
    elsif ($tag ne $run_id) {
        # otherwise just make sure one tag used
        error("Run tag inconsistent (`$tag' and `$run_id')");
        next;
    }

    # get query id
    if (!defined $queries{$q}) {
        error("Invalid query id ($q)");
        next;
    }
    my $type = $queries{$q}{'type'};

    # get slot name
    if (!defined $slots{$type}{$slot_name}) {
        error("Invalid slot name $slot_name for query $q with entity type $type");
        next;
    }

    $qids{$q}{$slot_name}++;

    if ($docid ne "NIL") {
        # make sure docid valid---
        # this is only an approximate test since so many docs for KBP track
        if ($docid !~ /\.LDC/) {
            error("Unknown document `$docid'");
            next;
        }

        # make sure slot value link is valid
        if (!defined $slot_value_link) {
            error("Missing slot value link for non-NIL slot value");
            next;
        }
        elsif ($slot_value_link ne "NIL" && $slot_value_link !~ /^E[0-9]{7}$/) {
            error("Unrecognizable KB node id ($slot_value_link) for slot value link");
            next;
        }

        # make sure slot is linkable if non-NIL link is returned
        if ($slot_value_link ne "NIL" && !$slots{$type}{$slot_name}{'linkable'}) {
            error("Non-NIL link given for non-linkable slot $slot_name");
            next;
        }

        # make sure answer exists
        if (!defined $answer || length($answer) == 0) {
            error("Missing answer-string for slot $slot_name for query $q");
            next;
        }
    }
    else {
        # A NIL response must end after the docid.  Test definedness
        # rather than truth so a stray "0" token is still reported.
        if (defined $answer && length($answer) > 0) {
            error("Answer string given when docid is NIL");
            next;
        }
        if (defined $slot_value_link && length($slot_value_link) > 0) {
            error("Slot value link given when docid is NIL");
            next;
        }
        $nils{$q}{$slot_name}++;
    }
}
close($results_fh);

# Do global checks:
#   error if some question that is not in the 'ignore' list has no response given for it
#   error if some question that is in the 'ignore' list has a response given for it
#   error if single-valued question has more than one response given for it
#   error if more than one response is given for a question where NIL has been given as an answer
# (these messages carry the line number of the last line read)
foreach my $q (keys %queries) {
    my $type = $queries{$q}{'type'};
    foreach my $slot_name (keys %{$slots{$type}}) {
        my $count   = $qids{$q}{$slot_name};
        my $ignored = (-1 != is_member($slot_name, $queries{$q}{'ignore'}));

        if (!defined $count && !$ignored) {
            error("No response given for slot $slot_name for query $q of type $type");
        }
        if (defined $count && $ignored) {
            error("$count responses given for slot $slot_name, which should be ignored for query $q");
        }
        if ($slots{$type}{$slot_name}{'quantity'} eq "single"
            && defined $count && $count > 1) {
            error("More than one response given for single-valued slot $slot_name for query $q");
        }
        if (defined $count && $count > 1 && defined $nils{$q}{$slot_name}) {
            error("More than one response given for slot $slot_name for query $q, where NIL has been given as an answer");
        }
    }
}

print {$errlog_fh} "Finished processing $results_file\n";
close($errlog_fh) or die "Close failed for error log $errlog: $!\n";

if ($num_errors) {
    exit 255;
}
exit 0;


# Print an error message to the error log, keeping track of the total
# number of errors.  Takes the message text as its argument.  Once more
# than $MAX_ERRORS errors have been logged, closes the log and exits
# with status 255.
sub error {
    my ($msg_string) = @_;

    print {$errlog_fh} "$0 of $results_file: Error on line $line_num --- $msg_string\n";

    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print {$errlog_fh} "$0 of $results_file: Quit.  Too many errors!\n";
        close($errlog_fh) or die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
    return;
}


# Returns index of string element in array if present, else -1.
# Arguments: the string to look for and a reference to the array.
sub is_member {
    my ($element, $arrayref) = @_;

    for my $i (0 .. $#{$arrayref}) {
        return $i if ($element eq $arrayref->[$i]);
    }
    return -1;
}