#!/usr/bin/perl
#
# Copyright (c) Michel Klein 2006
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
#
# Any questions relating to this software should be directed to:
# michel.klein@cs.vu.nl
#
use Text::BibTeX qw(:macrosubs :metatypes);
use Digest::SHA1 qw(sha1 sha1_hex);
use LWP::Simple;
use CGI;
# Landing page that requests without a `url` parameter are redirected to.
my $home = "http://www.cs.vu.nl/~mcaklein/bib2rdf/";

# Register the three-letter month macros commonly used in BibTeX files.
# The macro names are quoted on purpose: as a bareword, `oct` is parsed as
# Perl's builtin oct() applied to $_, so the October macro used to be
# registered under the name "0" and `month = oct` expanded to nothing.
add_macro_text('jan', "January");
add_macro_text('feb', "February");
add_macro_text('mar', "March");
add_macro_text('apr', "April");
add_macro_text('may', "May");
add_macro_text('jun', "June");
add_macro_text('jul', "July");
add_macro_text('aug', "August");
add_macro_text('sep', "September");
add_macro_text('oct', "October");
add_macro_text('nov', "November");
add_macro_text('dec', "December");

# BibTeX entry types whose output tag is not simply the capitalised type
# name.  Read by field2tag().
our %tags = (
    inbook        => "Inbook",
    incollection  => "InCollection",
    conference    => "InProceedings",
    inproceedings => "InProceedings",
    mastersthesis => "MasterThesis",
    phdthesis     => "PhDThesis",
    techreport    => "TechnicalReport",
    unpublished   => "Unpublished",
);

# Request parameters:
#   url      - location of the BibTeX file to convert (required)
#   relative - 'on' switches from absolute identifiers to relative ones
#   nolabel  - 'on' suppresses the readable summary label per entry
our $query = CGI->new;
our $url   = $query->param('url');

my $relative = $query->param('relative');
my $nolabel  = $query->param('nolabel');

# Both flags default to true; they are only cleared by an explicit 'on'.
our $abs   = !(defined $relative && $relative eq 'on');
our $label = !(defined $nolabel  && $nolabel  eq 'on');

# Without a source URL there is nothing to convert: send the visitor to the
# landing page instead.
if (!defined $url || $url eq "") {
    print $query->redirect($home);
    exit;
}
# Download the BibTeX document named by $url and spool a normalised copy to
# a scratch file ($fn) for the parser to read.
#
# The scratch file is named after this process's own PID.  The parent PID
# used previously is shared by every request when CGI scripts are spawned
# from a common daemon, which lets concurrent requests overwrite each other.
$fn = "tmp/$$.bib";

# Only remote schemes are fetched: LWP would otherwise read file:// URLs
# from the server's own disk on behalf of any visitor.
my $content;
if ($url =~ m{\A(?:https?|ftp)://}i) {
    # get() returns undef on failure, so the result must be kept in a scalar;
    # assigned to an array it always produces one element and the failure
    # check below could never trigger.
    $content = get($url);
}

if (!defined $content) {
    # $url is visitor-supplied: escape markup characters before echoing it.
    (my $shown = $url) =~ s/([&<>"'])/'&#' . ord($1) . ';'/ge;
    print "Content-type: text/html\n\n";
    print "\nError\nCannot find $shown.";
    exit;
}

# Normalise line endings, and make sure an "and" that ends a line is followed
# by whitespace so that name lists broken across lines still split correctly.
# /m is required because the whole document is held in one string.
$content =~ s/\r//g;
$content =~ s/and$/and /mg;

open(my $bib, '>', $fn) or die "Cannot write $fn: $!\n";
print {$bib} $content;
close($bib) or die "Cannot finish writing $fn: $!\n";
# Emit the HTTP header announcing an XML response in UTF-8.
print "Content-type: text/xml; charset=utf-8\n\n";
# NOTE(review): the statement below looks like the remains of a here-document
# (a `print` of a block terminated by EOH); the text of that block is not
# present in this copy of the file and must be restored from the upstream
# source before the script can run.
print <
EOH
# Read BibTeX from the spooled file and print one resource per regular entry.
#
# NOTE(review): several print statements in this block emit only whitespace
# or a bare value; the element markup they presumably carried is absent from
# this copy of the file -- confirm against the upstream source.
$bibfile = new Text::BibTeX::File $fn;
while ($entry = new Text::BibTeX::Entry $bibfile)
{
# Skip entries that failed to parse and anything that is not a regular entry.
next unless $entry->parse_ok;
next unless $entry->metatype == BTE_REGULAR;
@fields = $entry->fieldlist();
$type = &field2tag($entry->type());
$key = $entry->key();
# Colons in the citation key are replaced by underscores before the key is
# used as an identifier.
$key =~ s/:/_/g;
# Absolute mode identifies the entry by source URL plus "#key"; relative
# mode uses the bare key as rdf:ID.
if ($abs) {
print "<$type rdf:about=\"$url#$key\">\n";
} else {
print "<$type rdf:ID=\"$key\">\n";
}
print " \n";
# Reset the per-entry state that printLabel reads.
undef @authors;
undef @ids;
undef $title;
undef $journal;
undef $volume;
undef $number;
undef $booktitle;
undef $publisher;
undef $month;
undef $year;
for $field (@fields) {
$value = $entry->get($field);
# Empty fields are ignored entirely.
if ($value ne "") {
# TeX markup is stripped from every field except url, howpublished and
# note, which get their own handling below.
if (!(($field eq 'url') ||
($field eq 'howpublished') ||
($field eq 'note'))) {
$value = &deTex($value);
}
# Clean-up tex in url-field
if ($field eq 'url') {
$value =~ s/\\~{}/~/g;
$value =~ s/\"//g;
if ($value =~ /\\url/) {
$value =~ s/\\url{//g;
$value =~ s/}$//g;
}
}
# Cleanup month-field
if ($field eq 'month') {
# Fix for bug in Text::BibTex which leaves October out
# NOTE(review): a month value without any letters (for example a plain
# number) is also prefixed with "October" here -- confirm this is intended.
if ($value !~ /[a-zA-Z]/) {
$value = "October $value";
} else {
# Add space between month and day
$value =~ s/(\D+)(\d.*)/$1 $2/g;
}
}
# Encode XML specific characters
$value = &encodeXML($value);
# Turn \url{} into HTML url's for abstracts and notes
# NOTE(review): the replacement below contains only "$1" and a closing
# "</a>"; the opening anchor appears to be missing from this copy.
if (($field eq 'note') || ($field eq 'abstract')) {
if ($value =~ /\\url{.*}/ ) {
$value =~ s/\\url{(.*)}/$1<\/a>/g;
}
}
# Read values
# The fields below are remembered for the summary label, but only when
# labels are enabled.
if ($label) {
if ($field eq 'title') { $title = $value; }
if ($field eq 'journal') { $journal = $value; }
if ($field eq 'volume') { $volume = $value; }
if ($field eq 'number') { $number = $value; }
if ($field eq 'booktitle') { $booktitle = $value; }
if ($field eq 'publisher') { $publisher = $value; }
if ($field eq 'month') { $month = $value; }
if ($field eq 'year') { $year = $value; }
}
# Parse individual names in author and editor fields
if (($field eq 'author') || ($field eq 'editor')) {
@names = $entry->split($field);
# Keep an untouched copy of the author names for printLabel.
if ($field eq 'author') { @authors = @names; }
# Print individual author/editor ID's and store their ID and fullname
for $name (@names) {
# NOTE(review): without /g only the first match is removed, so a name with
# both leading and trailing whitespace keeps its trailing whitespace.
$name =~ s/(^\s+|\s+$)//;
$name = &deTex($name);
$id = "#".&storeName('person', $name);
# The fragment-only id is what gets remembered in @ids; the URL prefix
# added on the next line affects only the local $id.
@ids = (@ids, $id);
if ($abs) {$id = "$url$id";}
print " \n";
}
# Print ordered list of author/editor ID's
# NOTE(review): $listid is computed but not used by any statement visible
# in this block.
$listid = sha1_hex($url.$field.$key);
print " \n";
print " \n";
for $id (@ids) {
if ($abs) {$id = "$url$id";}
print " \n";
}
print " \n";
print " \n";
# Start with an empty id list for the next author/editor field.
undef @ids;
} elsif (($field eq 'organization') || ($field eq 'publisher')
|| ($field eq 'institution')) {
# Organisations are registered via storeName, like persons.
$id = "#".&storeName('organization', $value);
if ($abs) {$id = "$url$id";}
print " \n";
} elsif (($field eq 'abstract') || ($field eq 'note')) {
print " $value\n";
} elsif ($field eq 'url') {
print " \n";
} elsif ($field eq 'howpublished') {
# howpublished is either a \url{...} reference or free text; only the
# free-text form has its TeX markup stripped.
if ($value =~ /\\url{(.*)}/ ) {
print " \n";
} else {
$value = &deTex($value);
print " $value\n";
}
} else {
# Every other field is printed with its cleaned value.
print " $value\n";
}
}
}
if ($label) { &printLabel; }
# NOTE(review): this prints "$type>" without a leading "</"; it looks like
# the closing tag lost its prefix in this copy.
print "$type>\n\n";
}
# Print RDF descriptions for all persons
#
# %persons and %organizations are filled by storeName while the entries are
# processed; each maps an identifier to the stored display name.
# NOTE(review): both branches of the $abs tests below print the same bare
# newline; the markup that distinguished them appears to be missing from this
# copy of the file.
while (($id, $name) = each(%persons)) {
if ($abs) {
print "\n";
} else {
print "\n";
}
print " \n";
#if ($label) { print " $name\n"; }
# A name with two or more commas, or with a semicolon, is taken to be several
# authors run together and is replaced by a placeholder.
# NOTE(review): stored names have been through encodeXML, which terminates
# every numeric character code with ';', so a single name containing a
# non-ASCII character presumably also matches the semicolon test -- confirm.
if ( ($name =~ /,.+,/) ||
($name =~ /;/) ) {
$name = "CONCATENATED-AUTHOR-NAMES"; # ($name)";
}
print " $name\n\n\n";
}
# Print RDF descriptions for all organisations
while (($id, $name) = each(%organizations)) {
if ($abs) {
print "\n";
} else {
print "\n";
}
#if ($label) { print " $name\n"; }
print " $name\n\n\n";
}
print "\n";
# Remove the scratch copy of the downloaded BibTeX file.
unlink($fn);
# Subroutine to make all data PCDATA.
#
# Takes one string and returns a copy that is safe to place in XML text:
#   - '&', '<' and '>' become the predefined entities;
#   - control characters that XML 1.0 does not allow at all (everything
#     below 0x20 except tab, line feed and carriage return) are dropped;
#   - every remaining character outside printable ASCII becomes a decimal
#     character reference.
# An undefined argument is treated as the empty string.  The argument and
# the caller's variables are left untouched.
#
# NOTE(review): ord() is applied per character of the string as given; if
# callers pass undecoded UTF-8 bytes, each byte gets its own reference --
# confirm the encoding of the input.
sub encodeXML {
    my ($text) = @_;
    $text = '' unless defined $text;

    # XML encodings; '&' must go first so the entities added afterwards are
    # not escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # Remove control characters that are illegal in XML 1.0 even when written
    # as a character reference.
    $text =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//d;

    # Everything else outside printable ASCII becomes a numeric reference.
    $text =~ s/([^\x20-\x7F])/'&#' . ord($1) . ';'/ge;

    return $text;
}
# Subroutine to remove TeX specific characters.
#
# Takes one string and returns a cleaned copy:
#   - '~' (TeX non-breaking space) becomes a plain space;
#   - accent commands \' and \", lone backslashes, braces and a trailing
#     comma are removed;
#   - '<' and '>' are removed;
#   - '--' is shortened to '-';
#   - TeX quotes `` and '' become a double quote.
# An undefined argument is treated as the empty string.  The argument and
# the caller's variables are left untouched.
sub deTex {
    my ($text) = @_;
    $text = '' unless defined $text;

    $text =~ s/~/ /g;
    # Braces are kept inside a character class so they can never be read as a
    # quantifier by newer perls.
    $text =~ s/\\['"]?|[{}]|,$//g;
    $text =~ tr/<>//d;
    $text =~ s/--/-/g;
    $text =~ s/``|''/"/g;

    return $text;
}
# Subroutine to create the stack of persons / organisations;
# returns a string that can be used as identifier in RDF.
#
# Arguments:
#   $_[0] - 'person' or 'organization'; selects %persons or %organizations.
#           Any other value stores nothing but still returns the identifier.
#   $_[1] - the name to register.
#
# The identifier is derived from the raw name: '&' becomes "and", runs of
# whitespace and the characters ':' and '/' become '_', the characters
# ; & , . # are dropped, A-Z is lower-cased and everything outside printable
# ASCII is removed.  The value stored under it is the XML-encoded name.
#
# All working variables are lexical, so the caller's $name (which may alias
# the element of a list being iterated) is no longer modified.
sub storeName {
    my ($kind, $raw) = @_;

    my $display = encodeXML($raw);

    my $mkey = $raw;
    $mkey =~ s/&/and/g;
    $mkey =~ s/\s+/_/g;
    $mkey =~ s/[:\/]/_/g;
    $mkey =~ s/[;&,.#]//g;
    $mkey =~ tr/A-Z/a-z/;
    $mkey =~ s/[^\x20-\x7F]//g;

    # Plain element assignment: the later name wins for a repeated key, as
    # before, without rebuilding the whole hash on every call.
    if ($kind eq 'person') {
        $persons{$mkey} = $display;
    }
    elsif ($kind eq 'organization') {
        $organizations{$mkey} = $display;
    }

    return $mkey;
}
# Subroutine to translate a BibTeX entry type into the tag name used for the
# publication in the output.
#
# Takes the entry type as its only argument.  Types listed in %tags use the
# name given there; every other type is used as-is with its first letter
# upper-cased.  The result always carries the "ow:" prefix.
#
# The lookup result is held in a lexical so that the package variable $field,
# which the main loop iterates with, is not overwritten.
sub field2tag {
    # this should be generalized to allow arbitrary translations
    my ($type) = @_;

    my $tag = $tags{$type};
    if (!defined $tag || $tag eq "") {
        $tag = ucfirst $type;
    }

    return "ow:$tag";
}
# Subroutine to create an RDF label with a readable summary of
# the bibtex entry.
#
# Takes no arguments; reads the per-entry globals @authors, $title, $journal,
# $volume, $number, $booktitle, $publisher, $month and $year and prints one
# line of the form
#   Author, Author. Title. Journal Vol(No). In: Booktitle, Publisher, Month, Year
# leaving out every part whose value is empty.  Author names are run through
# deTex and encodeXML here; the other values are used as they are.
#
# The summary is built in a lexical variable.  Building it in $label, as was
# done before, overwrote the global flag that turns labels on and off, so one
# entry with an empty summary switched labels off for all later entries.
sub printLabel {
    my $text = '';

    if (@authors) {
        my @clean = map { encodeXML(deTex($_)) } @authors;
        $text = join(', ', @clean) . '. ';
    }

    $text .= $title if ($title);
    $text .= ". $journal" if ($journal);
    if ($volume) {
        $text .= " $volume";
        $text .= "($number)" if ($number);
    }
    $text .= ". In: $booktitle" if ($booktitle);
    # A number without a volume is spelled out instead of being parenthesised.
    $text .= ", number $number" if ($number && !($volume));
    $text .= ", $publisher" if ($publisher);
    $text .= ", $month" if ($month);
    $text .= ", $year" if ($year);

    print " $text\n";
}