#!/usr/bin/perl -w
use strict;
######################################################################
# Artemis Marfez Reverse Engineer, (C) 2015-2024 Nosey Nick Waterman,
#   <perl@noseynick.org>   https://noseynick.org/artemis/
# All wrong righted, all rights reserved. Licensed under the GNU
# Affero General Public License v3.0 https://www.gnu.org/licenses/agpl.txt
# with Commons Clause https://commonsclause.com/ v1.0
######################################################################
# Given a load of Artemis hex on stdin (deadbeef headers stripped,
# payloads only). This prog will analyse the packets and try to
# reverse-engineer their formats. It will also look in more detail
# at the 0x80803df9 packets for different object types, and attempt
# to discover the field lengths.
######################################################################
# This program is named in honour of Zefram Cochrane, inventor
# of the Earth's first Warp Drive. It seemed only fitting to
# reverse Zefram's name for our reverse-enngineering tool   :-)
######################################################################
use Data::Dumper; # only for Data::Dumper::qquote?
use bytes; # OK there's SOME UTF16 kicking around, but honestly...

# Assume the following types have the following length bitmasks
# (only needed if this prog can't determine them from observations):
my %assume = (
    # 0x06 => 2, # observed bf20 should PROBABLY be ff30
);

$| = 1; # output autoflush
my (%bits, %minid, %maxid, %dat, %pkts, $n);
while (<>) {
    print "#"                unless $n++ % 1000;
    print " $n packets...\n" unless $n   % 100000;
    
    # Strip channel numbers. Also the DEADBEEF header
    # if you didn't read the instructions above   :-p
    s/^(?:[CS]\d*\s*)?(?:efbeadde................................)?//;
    
    next unless /^([0-9a-f]{8,})/i; # ignore non-hex
    my ($pkttype, $payload) = unpack('V a*', pack('H*', $1));
    next unless defined $pkttype;
    
    if ($pkttype == 0x80803df9) { objectBitStream($payload) }
    else                        { packet($pkttype, $payload) }
}

sub packet {
    my ($pkttype, $payload) = @_;
    # +++++++ definitely need to be able to handle packets with SUBTYPES but...
    push @{$pkts{sprintf('0x%08x', $pkttype)}}, $payload;
}

sub objectBitStream {
    my $payload = shift;
    my ($type, $id, $bits) = unpack("C V a*", $payload);
    return unless $type;
    
    # (We think) we have a good objectBitStream packet
    my $terminator = substr($payload, -4, 4, '');
    unless ($terminator eq "\0\0\0\0") {
	$payload .= $terminator;
	print "Terminator wasn't \\0\\0\\0\\0 in... ",
	  unpack('H*', $payload), "\n";
    }
    
    push @{$dat{$type}}, $payload;
    $bits{$type} |= substr($bits,0,20);
    $minid{$type} = $id if $id < ($minid{$type} // 9999999999);
    $maxid{$type} = $id if $id > ($maxid{$type} // 0);
}

######################################################################

print " $n packets scanned\n";

print "# Object analysis:\n" if %bits;
my ($min, $max);
for my $type (sort {$a <=> $b} keys %bits) {
    my $tmin = $minid{$type}; $min = $tmin if $tmin < ($min // 9999999999);
    my $tmax = $maxid{$type}; $max = $tmax if $tmax > ($max // 0);
    my $mask = $bits{$type};
    my ($bits, $bytes, $unsure, $bitpat);
    while ($mask =~ s/^\xff//) { $bits += 8; $bytes++; $bitpat .= '.'; }
    if    ($mask =~  /^\x7f/ ) { $bits += 7; $bytes++; $bitpat .= '[\0-\x7f]'; }
    elsif ($mask =~  /^\x3f/ ) { $bits += 6; $bytes++; $bitpat .= '[\0-\x3f]'; }
    elsif ($mask =~  /^\x1f/ ) { $bits += 5; $bytes++; $bitpat .= '[\0-\x1f]'; }
    elsif ($mask =~  /^\x0f/ ) { $bits += 4; $bytes++; $bitpat .= '[\0-\x0f]'; }
    elsif ($mask =~  /^\x07/ ) { $bits += 3; $bytes++; $bitpat .= '[\0-\7]'; }
    elsif ($mask =~  /^\x03/ ) { $bits += 2; $bytes++; $bitpat .= '[\0-\3]'; }
    elsif ($mask =~  /^\x01/ ) { $bits += 1; $bytes++; $bitpat .= '[\0\1]'; }
    elsif ($mask =~  /^\x00/ ) {           $bytes++; $bitpat .= '\0'; }
    else {
	$bytes++;
	$bytes = $assume{$type} || $bytes;
	printf "#### ASSUMING %d bits (%d byte%s) due to poor mask %s:\n",
	  $bytes*8, $bytes, $bytes > 1 ? 's' : '',
	  unpack('H*', $bits{$type});
	$bits = 8*$bytes;
	$bitpat = '.' x $bytes;
    }
    
    my @packets = sort {
	(length($a) <=> length($b)) || ($a cmp $b)
    } @{ delete $dat{$type}};
    printf "# ObjType 0x%02x IDs %d..%d %s-bit field (%d byte%s) (%d packets)\n",
      $type, $tmin, $tmax, $bits, $bytes,
      $bytes > 1 ? 's' : '',
      scalar(@packets);
    
    my $typat = sprintf('\x%02x', $type);
    my $idpat;
    $tmax += 10; # to catch a few extra IDs that only appear later ++++++++++ MAY NOT BE ENOUGH
    for (0 .. 3) {
	# ++++ "Could do better" by collecting ACTUAL values for bytes 0..3?
	if      ($tmin == $tmax) {
	    $idpat .= sprintf '\x%02x', $tmin & 255;
	} elsif (($tmin | 255) == ($tmax | 255)) {
	    # $idpat .= sprintf '[\x%02x-\x%02x]', $tmin & 255, $tmax & 255;
	    $idpat .= '.'; # ++++++++++ $tmax += 10 might have been NOWHERE NEAR enough?
	} else {
	    $idpat .= '.';
	}
	$tmin = $tmin >> 8;
	$tmax = $tmax >> 8;
    }

    my %obj;
    print "  ### Matching ^f93d8080 ($typat)($idpat)($bitpat):\n";
    for my $packet (@packets) {
	my ($first, $ty, $id, $bits, $data, @rest) =
	  split /($typat)($idpat)($bitpat)/s, $packet;
	print "   NON-EMPTY ", unpack('H*', $first.$ty.$id.$bits.$data), "\n"
	  if $first;
	while (defined $bits) {
	    push @{ $obj{$bits} }, $data;
	    ($ty, $id, $bits, $data, @rest) = @rest;
	}
    }
    
    for my $mask (
	sort {
	    (length(mask2bits($a)) <=> length(mask2bits($b)))
	      || (mask2bits($a) cmp mask2bits($b))
	} keys %obj
    ) {
	my $bits = mask2bits($mask);
	printf "  Mask %s: (bits %s):\n", unpack('H*', $mask), $bits;
	analyse(@{ $obj{$mask} });
    }
}

print "# ID range $min .. $max\n" if defined($min) && defined($max);

if (%pkts) { for my $type (sort keys %pkts) {
    print "# Packet type $type:\n";
    analyse(@{ $pkts{$type} });
}}

sub analyse {
    my @lens;
    for (@_) { $lens[length $_]++; }
    print"    lengths:";
    for (0 .. scalar(@lens)-1) {
	print " ${_}B(x", $lens[$_], ")"  if $lens[$_];
    }
    print "\n";
    
    # Look for possible strings, floats, u8, u16, u32:
    my @poss;
    for my $one (@_) {
	my $off = -1; my $adjoff = -1; 
	while (++$off < length($one)) {
	    $adjoff++;
	    $poss[$adjoff] //= {};
	    my $is = $poss[$adjoff];
	    canbe($is, 'u8',  unpack('C',  substr($one, $off)));
	    canbe($is, 'u16', unpack('v',  substr($one, $off)));
	    my $u32         = unpack('V',  substr($one, $off)) // next;
	    my $float       = unpack('f<', substr($one, $off));
	    if      ($u32 == 0) { # zero float OR zero u32
		canbe($is, 'float', $float);
		canbe($is, 'u32',   $u32);
		next; # don't try to check for 0-len str
	    } elsif ($u32 == 0x80000000) { # -0 float?
		canbe($is, 'float', $float);
		canbe($is, 'u32',   $u32);
		next; # not gonna be a string
	    } elsif (($u32 & 0x70000000)==0x30000000   # small float 0.X
		or   ($u32 & 0x78000000)==0x40000000 ) # big float XXXX.XXX
	    { 
		canbe($is, 'float', $float); # doubly likely
		# COULD be but less likely:
		for (-3 .. 3) {
		    next unless $adjoff+$_ >= 0;
		    $poss[$adjoff+$_]->{not_float}+=.5 if $_;		    
		    $poss[$adjoff+$_]->{not_u32}++;
		    $poss[$adjoff+$_]->{not_u16}++     if $_ > -2;
		    $poss[$adjoff+$_]->{not_u8}++      if $_ > -1;;
		}
		next; # not gonna be a string
	    }
	    $is->{not_float}++; # or stupidly tiny one?
	    canbe($is, 'u32',   $u32);
	    # How about the possibility of strings? Check u32 length:
	    next unless ($off + 4 + $u32 + $u32) <= length($one);
	    my @str = unpack('V/v', substr($one, $off));
	    # does this look like a vlid string? ...
	    next unless scalar(@str) == $u32;  # ,,, which we can parse?
	    next if pop(@str);                 # ... with terminating zero?
	    next if grep {$_ >= 127} @str;     # all ASCII?
	    next unless grep {$_ > 31} @str;   # SOME printable? not 0s/1s?
	    # OK, we SEEM to have a viable string. Store it, and adjust
	    # offset for future floats/strings. The adjusted offset
	    # attempts to remove PREVIOUS strings, in the hope that they
	    # will all start in the same place, not influenced by
	    # length of strings earlier in the packet.
	    $is->{'str'}++;
	    $is->{Data::Dumper::qquote(pack('U*', @str))}++;
	    # COULD be but less likely:
	    for (-3 .. 3) {
		next unless $adjoff+$_ >= 0;
		$poss[$adjoff+$_]->{not_float}++ if $_;
		$poss[$adjoff+$_]->{not_u32}++;
		$poss[$adjoff+$_]->{not_u16}++   if $_ > -2;
		$poss[$adjoff+$_]->{not_u8}++    if $_ > -1;
	    }
	    $off += $u32 + $u32; # but NOT $adjoff
	}
    }
    for (0 .. scalar(@poss)-1) {
	my $is = $poss[$_];
	print "    payload[$_]:";
	for my $type (qw(u8 u16 u32 float str)) {
	    my $count = (delete $is->{$type}       // 0);
	    my $not   = (delete $is->{"not_$type"} // 0); $count += -$not;
	    my $min   =  delete $is->{$type.'_min'};
	    my $max   =  delete $is->{$type.'_max'};
	    next unless $count > 0;
	    print " $type(";
	    print "x$count" if $count > 1;
	    print " " if ($count > 1) && defined($min) && defined($max);
	    print "$min..$max" if defined($min) && defined($max);
	    print ")";
	}
	for my $str (sort keys %$is) {
	    print " ${str} (x", $is->{$str}, ")";
	}
	print "\n";
    }
}

sub canbe {
    my ($is, $type, $val, $add) = @_;
    return unless defined $val;
    $is->{$type}         += $add // 1;
    $is->{$type.'_min'} //= $val;
    $is->{$type.'_min'}   = $val if $val < $is->{$type.'_min'};
    $is->{$type.'_max'} //= $val;
    $is->{$type.'_max'}   = $val if $val > $is->{$type.'_max'};
}

sub mask2bits {
    my (@ret, $byte);
    for my $b (unpack('C*', shift)) {
	$byte++;
	push @ret, "$byte.1" if $b &   1;
	push @ret, "$byte.2" if $b &   2;
	push @ret, "$byte.3" if $b &   4;
	push @ret, "$byte.4" if $b &   8;
	push @ret, "$byte.5" if $b &  16;
	push @ret, "$byte.6" if $b &  32;
	push @ret, "$byte.7" if $b &  64;
	push @ret, "$byte.8" if $b & 128;
    }
    return "@ret";
}