#!/usr/bin/perl -w

#
# dbcoluniqcount.pm
# Copyright (C) 1997-2025 by John Heidemann <johnh@isi.edu>
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#

package Fsdb::Filter::dbcoluniqcount;

=head1 NAME

dbcoluniqcount - count rows with identicial fields

=head1 SYNOPSIS

dbrowuniq [-FLB] [uniquifying fields...]

=head1 DESCRIPTION

Count number rows grouped by each value of field (or fields).
Equivalent to the Fsdb L<dbrowuniq> command with C<-c>,
but does not require the lines to be adjacent.

Unlike L<dbrowuniq>, we do I<not> require rows to be adajcent.
By default we the number of unique rows is small and we cache them in memory,
running quickly (O(n) time).
Optionally in the future we will give up and fall back on external sorting,
but this feature is not yet implemented.

As with L<dbroquniq>,
by default, L<dbcoluniqcount> outputs the I<first> unique row.
Optionally, with C<-L>, it will output the I<last> unique row,
or with C<-B> it outputs both first and last.
(This choice only matters when uniqueness is determined by specific fields.)

Incremental counting, when the C<count> column already exists,
is possible with C<-I>.
With incremental counting, the existing count column is summed.

=head1 OPTIONS

=over 4

=item B<-c> or B<--count>

Create a new column (count) which counts the number of times
each line occurred.

The new column is named by the C<-N> argument, defaulting to C<count>.

=item B<-N> on B<--new-name>

Specify the name of the count column, if any.
Please specify the type with the name, if desired
(allowing one to pick sizes smaller than the default quad, if desired).
(Default is C<count:q>.)

=item B<-I> on B<--incremental>

Incremental counting.
If the count column exists, it is assumed to have a partial count
and the count accumulates.
If the count column doesn't exist, it is created.

=item B<-L> or B<--last>

Output the last unique row only.
By default, it outputs the first unique row.

=item B<-F> or B<--first>

Output the first unique row only. 
(This output is the default.)

=item B<-B> or B<--both>

Output both the first and last unique rows. 

=back

=for comment
begin_standard_fsdb_options

This module also supports the standard fsdb options:

=over 4

=item B<-d>

Enable debugging output.

=item B<-i> or B<--input> InputSource

Read from InputSource, typically a file name, or C<-> for standard input,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<-o> or B<--output> OutputDestination

Write to OutputDestination, typically a file name, or C<-> for standard output,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<--autorun> or B<--noautorun>

By default, programs process automatically,
but Fsdb::Filter objects in Perl do not run until you invoke
the run() method.
The C<--(no)autorun> option controls that behavior within Perl.

=item B<--header> H

Use H as the full Fsdb header, rather than reading a header from
then input.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back

=for comment
end_standard_fsdb_options


=head1 SAMPLE USAGE

=head2 Input:

    #fsdb      event
    _null_getpage+128
    _null_getpage+128
    _null_getpage+128
    _null_getpage+4
    _null_getpage+4
    _null_getpage+4
    _null_getpage+128
    _null_getpage+128
    _null_getpage+128
    _null_getpage+4
    _null_getpage+4
    _null_getpage+4
    #  | /home/johnh/BIN/DB/dbcol event
    #  | /home/johnh/BIN/DB/dbsort event

=head2 Command:

    cat data.fsdb | dbrowuniq -c

=head2 Output:

    #fsdb	event	count
    _null_getpage+128	6
    _null_getpage+4	6
    #	2	/home/johnh/BIN/DB/dbcol	event
    #  | /home/johnh/BIN/DB/dbcoluniqcount

=head1 SAMPLE USAGE 2

Retaining the last unique row as an example.

=head2 Input:

	#fsdb event i
	_null_getpage+128 10
	_null_getpage+128 11
	_null_getpage+128 12
	_null_getpage+4 16
	_null_getpage+4 17
	_null_getpage+4 18
	_null_getpage+128 13
	_null_getpage+128 14
	_null_getpage+128 15
	_null_getpage+4 19
	_null_getpage+4 20
	_null_getpage+4 21
	#  | /home/johnh/BIN/DB/dbcol event
	#  | /home/johnh/BIN/DB/dbsort event

=head2 Command:

    cat data.fsdb | dbrowuniq -c -L event

=head2 Output:

	#fsdb event i count
	_null_getpage+128	15	6
	_null_getpage+4	21	6
	#  | /home/johnh/BIN/DB/dbcol event
	#  | /home/johnh/BIN/DB/dbsort event
	#   | dbcoluniqcount -L event


=head1 SEE ALSO

L<Fsdb>, L<dbrowuniq>.


=head1 CLASS FUNCTIONS

=cut

@ISA = qw(Fsdb::Filter);
$VERSION = 2.0;

use strict;
use Pod::Usage;
use Carp;

use Fsdb::Filter;
use Fsdb::IO::Reader;
use Fsdb::IO::Writer;


=head2 new

    $filter = new Fsdb::Filter::dbrowuniq(@arguments);

Create a new dbrowuniq object, taking command-line arguments.

=cut

sub new ($@) {
    my $class = shift @_;
    my $self = $class->SUPER::new(@_);
    bless $self, $class;
    $self->set_defaults;
    $self->parse_options(@_);
    $self->SUPER::post_new();
    return $self;
}


=head2 set_defaults

    $filter->set_defaults();

Internal: set up defaults.

=cut

sub set_defaults ($) {
    my($self) = @_;
    $self->SUPER::set_defaults();
    $self->{_which} = 'F';
    $self->{_incremental} = undef;
    $self->{_uniquifying_cols} = [];
    $self->{_destination_column} = 'count:q';
    $self->{_header} = undef;
}

=head2 parse_options

    $filter->parse_options(@ARGV);

Internal: parse command-line arguments.

=cut

sub parse_options ($@) {
    my $self = shift @_;

    my(@argv) = @_;
    $self->get_options(
	\@argv,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'autorun!' => \$self->{_autorun},
	'B|both' => sub { $self->{_which} = 'B' },
	'F|first|nolast' => sub { $self->{_which} = 'F' },
	'header=s' => \$self->{_header},
	'I|incremental!' => \$self->{_incremental},
	'L|last' => sub { $self->{_which} = 'L' },
	'close!' => \$self->{_close},
	'd|debug+' => \$self->{_debug},
	'i|input=s' => sub { $self->parse_io_option('input', @_); },
	'log!' => \$self->{_logprog},
	'N|new-name=s' => \$self->{_destination_column},
	'o|output=s' => sub { $self->parse_io_option('output', @_); },
	) or pod2usage(2);
    push (@{$self->{_uniquifying_cols}}, @argv);
}

=head2 setup

    $filter->setup();

Internal: setup, parse headers.

=cut

sub setup ($) {
    my($self) = @_;

    my(@finish_args) = (-comment_handler => $self->create_delay_comments_sub);
    push (@finish_args, -header => $self->{_header}) if (defined($self->{_header}));
    $self->finish_io_option('input', @finish_args);

    if ($#{$self->{_uniquifying_cols}} == -1) {
        my($destination_column_name) = $self->{_in}->colspec_to_name_type_spec($self->{_destination_column});
	foreach (@{$self->{_in}->cols()}) {
	    push (@{$self->{_uniquifying_cols}}, $_)
		if ($_ ne $destination_column_name);
	};
    } else {
	foreach (@{$self->{_uniquifying_cols}}) {
	    croak($self->{_prog} . ": unknown column ``$_''.\n")
		if (!defined($self->{_in}->colspec_to_i($_)));
	};
    };

    $self->finish_io_option('output', -clone => $self->{_in}, -outputheader => 'delay');
    if ($self->{_out}->colspec_to_i($self->{_destination_column})) {
	if (!$self->{_incremental}) {
            croak($self->{_prog} . ": cannot create column " . $self->{_destination_column} . " (it already exists)\n");
	};
    } else {
	$self->{_out}->col_create($self->{_destination_column})
	    or croak($self->{_prog} . ": cannot create column " . $self->{_destination_column} . " (maybe it already existed?)\n");
	$self->{_incremental} = undef;   
    };
}

=head2 run

    $filter->run();

Internal: run over each rows.

=cut
sub run ($) {
    my($self) = @_;

    my $read_fastpath_sub = $self->{_in}->fastpath_sub();
    my $write_fastpath_sub = $self->{_out}->fastpath_sub();
    my $count_coli = undef;
    $count_coli = $self->{_out}->colspec_to_i($self->{_destination_column});
    die("internal error: cannot find count_coli for " . $self->{_destination_column} . "\n")
        if (!defined($count_coli));
    
    my $output_fref = [];
    my $this_fref;
    my %saved_first_fref;
    my %saved_last_fref;
    my %count;
    my @order;

    my $generate_uniq_code = '$uniq = ""; ';
    foreach (@{$self->{_uniquifying_cols}}) {
	my $coli = $self->{_in}->colspec_to_i($_);
	croak($self->{_prog} . ": internal error, cannot find column $_ even after checking already.\n")
	    if (!defined($coli));
	$generate_uniq_code .= '$uniq .= $this_fref->[' . $coli . ' ] . "\t";';
    };
    print $generate_uniq_code if ($self->{_debug});

    my $count_increment_code = ($self->{_incremental} ? '$this_fref->[' . $count_coli . ']' : '1');

    my $remember_first_occurence_code = q'@{$saved_first_fref{$uniq}} = @{$this_fref};';
    my $remember_last_occurence_code = q'@{$saved_last_fref{$uniq}} = @{$this_fref};';
    $remember_last_occurence_code = '' if ($self->{_which} eq 'F'); # optimize

    my $input_delay_comments = $self->{_delay_comments}[0];
    my $delay_comments_out = $self->{_out};
    my $delay_comments_flush_code = $input_delay_comments ? '$input_delay_comments->flush($delay_comments_out);' : "";
    #
    # loop over input
    #
    my $input_loop_code = q'
        my $uniq;
	while ($this_fref = &$read_fastpath_sub()) {
            ' . $delay_comments_flush_code . q'
            ' . $generate_uniq_code . q'
	    if (!defined($count{$uniq})) {
                # first
                push(@order, $uniq);
                $count{$uniq} = ' . $count_increment_code . ";\n\t" .
                $remember_first_occurence_code . "\n\t" .
                $remember_last_occurence_code . q'
            } else {
                # subsequent
                $count{$uniq} += ' . $count_increment_code . q'; ' .
                $remember_last_occurence_code . '
            };
        };' . "\n";
    eval $input_loop_code;
    $@ && croak($self->{_prog} . ": internal eval error: $@\n");

    #
    # output results
    #
    my $remember_count_code = '$output_fref->[' . $count_coli . '] = $count{$uniq};' . "\n";
    my $output_first_occurence_code = '';
    if ($self->{_which} eq 'F' || $self->{_which} eq 'B') {
	$output_first_occurence_code .= q'
	    @{$output_fref} = @{$saved_first_fref{$uniq}};' .
	    $remember_count_code . q'
	    &$write_fastpath_sub($output_fref);' . "\n";
    };
    my $output_last_occurence_code = '';
    if ($self->{_which} eq 'L' || $self->{_which} eq 'B') {
	$output_last_occurence_code .= '
	    @{$output_fref} = @{$saved_last_fref{$uniq}};' .
	    $remember_count_code . q'
	    &$write_fastpath_sub($output_fref);' . "\n";
    };

    my $output_loop_code = q'
        for my $uniq (@order) {' .
            $output_first_occurence_code .
            $output_last_occurence_code .
        q'};
    ';
    eval $output_loop_code;
    $@ && croak($self->{_prog} . ": internal eval error: $@\n");
    
};


=head1 AUTHOR and COPYRIGHT

Copyright (C) 1997-2025 by John Heidemann <johnh@isi.edu>

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut

1;
