Listing of atfsplit2.plx

#!/usr/bin/perl
use warnings; use strict; use utf8;

use Getopt::Long;
use Pod::Usage;

my $base = '';
my $dryrun = 0;
my $install = 0;
my $update = 0;
my $verbose = 0;
my $list = '';
my $stdin = 0;
my $cat = 0;
my $except = 0;
my %print = ();
my $mydir = '';
my $shallow = 0;
my $show_updates = 0;
my @texts = ();
my $use_lexical = 0;

GetOptions('base:s'=>\$base,
           'cat+'=>\$cat,
           'dir:s'=>\$mydir,
           'dryrun+'=>\$dryrun,
           'except'=>\$except,
           'install'=>\$install,
           'list:s'=>\$list,
           'shallow'=>\$shallow,
           'show-updates'=>\$show_updates,
           'text:s'=>\@texts,
           'update'=>\$update,
           'verbose'=>\$verbose,
           ) or pod2usage(1);

#################################################################

my $counter = 0;
my $P = '';
my %P = ();
my $printing = 0;
my $dir = $mydir || 'D000';
my @preamble = ();
my $fname = '';
my $current_type = '';
my $old_atf = '';
my $new_atf = '';
my @updates = ();

if ($list) {
    open(IN,$list) || die "can't open list file $list\n";
    while (<IN>) {
        foreach my $n (split(/\s+/)) {
            ++$print{$n};
        }
    }
    close(IN);
} elsif ($#texts >= 0) {
    $list = 'texts';
    foreach my $t (@texts) {
        ++$print{$t};
    }
}

if ($install && !length($base)) {
    $base = '/usr/local/share/cdl/texts';
}

while (<>) {
#    if (/^\@(?:composite|synopticon|transliteration)/) {
#       $current_type = $_;
#       next;
#    }
#    if (/^\#atf use lexical/) {
#       $use_lexical = 1;
#       next;
#    }
    s/^\x{ef}\x{bb}\x{bf}//; # remove BOMs
    if (/^&([PQX]\S+)/) {
        if ($P ne $1) {
            my $last_P = $P;
            $P = $1;
            close OUT unless $cat;
            if (!$dryrun && $install && $update && $old_atf) {
                my $old = slurp($old_atf);
                my $new = slurp($new_atf);
                if ($old ne $new) {
                    print STDERR "updating $old_atf\n" if $verbose;
                    open(OLD,">$old_atf");
                    print OLD $new;
                    close(OLD);
                    push @updates, $last_P;
                } else {
                    print STDERR "$old_atf is unchanged\n" if $verbose;
                }
                unlink $new;
            }
            select STDOUT;
            print "$fname\n" if $fname && $verbose && !$update;
            if ($list && !print_this_p($P)) {
                @preamble = ();
                $printing = 0;
                next;
            }
#           exit if $counter++ > 3;
            if ($install) {
                my $dname = $P;
                my $PQ = $P;
                $PQ =~ s/^([PQ]).*$/$1/;
#               $dname =~ s/...$/xxx/;
                $dname =~ s/...$//;
#               if ($shallow) {
#                   $dname = "$base/$P";
#               } else {
#                   $dname = "$base/$PQ/$dname/$P";
#               }
                $dname = "$base/$dname/$P";
                $fname = "$dname/$P.atf";
                if ($dryrun) {
                    print "$fname\n";
                }
                if (!$dryrun) {
                    system("mkdir -p $dname") unless -d $dname;
                    if ($update) {
                        $old_atf = $fname;
                        $new_atf = "$fname.new";
                        $fname = $new_atf;
                    }
                    open(OUT,">$fname") || die "atfsplit.plx: can't write '$fname'\n";
                    select OUT;
                    $printing = 1;
#                   print "#atf use lexical\n" if $use_lexical;
#                   print $current_type if $current_type;
#                   print OUT @preamble; @preamble = ();
                    print OUT;
                    chown 48, 48, $fname;
                    chmod 0664, $fname;
                }
            } else {
                my $samedir = (++$counter % 1000);
                if (!$cat && !$samedir) {
                    ++$dir if $dir =~ /\d$/;
                    system "mkdir -p $dir" if $dir;
#                   print STDERR "$dir\r";
                }
#           print STDERR "$counter\r" unless 
                if (!defined $P{$P}) {
                    ++$P{$P};
                    $fname = "$P.atf";
                    $fname = "$dir/$fname" if $dir;
                    if ($dryrun) {
                        print "$fname\n";
                    }
                    if (!$dryrun) {
                        if (!$cat) {
                            if ($dir) {
                                system "mkdir -p $dir"
                                    || die "can't mkdir $dir\n";
                            }
                            open(OUT,">$fname")
                                || die "can't open $fname for write\n";
                            select OUT;
                        }
                        $printing = 1;
                        print @preamble; @preamble = ();
                        print;
                    }
                } else {
                    warn "atfsplit.plx: $P occurs more than once\n";
                    $printing = 0;
                }
            }
        }
    } elsif (/^&/) {
        warn "atfsplit.plx: bad &-line '$_'";
        $printing = 0;
    } else {
        print if $printing;
    }
}

if ($show_updates) {
    select STDOUT;
    print join("\n",@updates),"\n";
}

sub
print_this_p {
    my $P = shift;
    if ($except) {
        !$print{$P};
    } else {
        $print{$P};
    }
}

sub
slurp {
    my $fn = shift;
    local $/ = undef;
    open(IN,$fn); my $ret = <IN>; close(IN);
    $ret;
}

#################################################################

1;

__END__

=head1 NAME

atfsplit.plx -- split up ATF files into their constituent PQ-files

=head1 SYNOPSIS

atfsplit.plx [options] [file]

=head1 OPTIONS

=over

=item B<-base dir>

Use dir as the base into which to split the files.  By default, this is D000, D001, etc.

=item B<-cat>

Spool the output straight onto STDOUT like unix cat does.  Use with -list to
extract a sub-corpus from a bigger file.

=item B<-dir>

Create the files in 'dir', which is appended to 'base' if given.  If you want
to split the files into the current directory with no subdirectories use
'-dir .'.  If the dir name ends in a digit, it is incremented every thousand
files (similar to the default behaviour with the dir name D000, D001, D002
etc.).

=item B<-dryrun>

Just print the names of the files which would be generated; don't create any files.

=item B<-except>

Use with -list; output everything except the texts given in the list.

=item B<-install>

Install the individual PQ-files into the cdl/texts tree.

=item B<-list filename>

Read a list of P/Q IDs from filename and output only those texts.

=item B<-shallow>

When building pathnames do not include mid-level directories of the form P/P000xxx,
Q/Q100xxx etc.

=item B<-show-updates>

Produce a list of updated texts.

=item B<-update>

Only produce the ATF file for a text if the current version is different
from what is in the archive being split.

=item B<-verbose>

Print the names of files as they are generated.

=back

=head1 DESCRIPTION

atfsplit reads a file which may contain more than one transliteration
and splits it up into one file per transliteration.  The output is
grouped in directories containing at most 1000 files each, the
subdirectories being named D000, D001, etc.  With the -install option
the files are split directly into the cdl/texts tree.

=head1 AUTHOR

Steve Tinney (stinney@sas.upenn.edu)

=head1 COPYRIGHT

Copyright(c) Steve Tinney 2004.

Released under the Gnu General Public License
(http://www.gnu.org/copyleft/gpl.html).