/*
  bwdiff.c - Byte-wise Difference: diff two almost-identical files.

  This program is used to display the bytes changed between two files.	The
  bytes from the first (changed) file are displayed first, followed by the bytes
  from the second (original) file, in square brackets.	Bytes that are printable
  (ASCII) are displayed as characters enclosed in quotes (a quote itself is
  duplicated).

  Jason Hood, 7 September, 1998.
  Public Domain.

  981228: display the changed filename, or optionally the original filename;
	  if one or two bytes are identical within a difference, include them
	   in the diff list.
  000222: stop diffing if too different; display message if identical.
  000303: changed message if too different.
  001001: split difference across lines;
	  option to disable string translation.

  v1.00, 15 & 16 October, 2003:
    better handling of different sized files;
    display the count if diff is too big;
    surround the filename in quotes if it contains spaces.

  v1.01, 7 September, 2004:
    use a buffer to store the different bytes;
    increase default maximum to 88 bytes;
    line up the last line of a multi-line diff;
    require four printable characters before using a string;
    calculate the lengths of both files, use the smaller;
    modified return codes.

  v1.10, 25 to 27 October, 2012:
    options to select the bytes per line and the space between bytes;
    increased default maximum to 160 bytes;
    option to select the minimum string size;
    warn if the files have different lengths;
    display nothing if the files are identical;
    split a line at a string;
    more flexible option parsing;
    detect wide (two-byte little-endian) strings;
    option to form two- to four-byte diffs as little-endian.
*/

#define PVERS "1.10"
#define PDATE "27 October, 2012"


#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

// NOTE(review): these empty definitions presumably replace DJGPP's startup
// hooks so that no environment file is loaded and command-line wildcards are
// not expanded - confirm against the DJGPP libc documentation.
#ifdef __DJGPP__
void   __crt0_load_environment_file( char* dummy ) { }
char** __crt0_glob_function( char* dummy ) { return 0; }
#endif

// NOTE(review): presumably turns off the MinGW runtime's wildcard expansion
// of the command line - confirm against the MinGW runtime documentation.
#ifdef __MINGW32__
int _CRT_glob = 0;
#endif

// Put back the macro version of getc, which *greatly* improves performance
// when using /MD.
#if defined(_MSC_VER) && !defined(getc)
#define getc(_stream)     (--(_stream)->_cnt >= 0 \
                ? 0xff & *(_stream)->_ptr++ : _filbuf(_stream))
#endif


// Display options.  They are set by main() while parsing the command line and
// read by is_string(), calculate_len() and display().
int string = 4; 		// Minimum run of printables shown as a string (0 = never; -s, -b)
int width;			// Characters available per line for one file's bytes (from -w)
int space = 1;			// Non-zero puts a space between hex bytes (cleared by -n)
int little;			// Non-zero shows 2- to 4-byte diffs little-endian (-x)

// Defined after main().
int calculate_len( unsigned char* buf, int len );
int display( unsigned char* buf, int len );


// Values returned by main() (the help and version screens return 0 directly).
enum
{
  E_SAME,			// Files are identical
  E_DIFF,			// Files are different
  E_TOO,			// Files are too different
  E_OPT,			// Unknown/invalid option
  E_MEM,			// Not enough memory for buffer
  E_NEW,			// Unable to open changed file
  E_OLD,			// Unable to open original file
};


// Message texts used by main().  Those containing a conversion (%s, %d) are
// passed as printf-style format strings; the others are written with fputs.
#define STRING static const char
STRING zUnknownOpt[] = "Unknown option: %s\n";
STRING zTwoFiles[]   = "Two file names are required.\n";
STRING zNoMemory[]   = "Not enough memory.\n";
STRING zBadFile[]    = "%s: unable to open.\n";
STRING zWarnSize[]   = "Warning: files have different sizes.\n";
STRING zTooMuch[]    = "Difference is too great! (%d bytes found.)\n";


/*
  Parse the options, open both files and list every run of differing bytes.

  Each difference is printed as "offset: changed-bytes [ original-bytes ]",
  split over several lines when it does not fit the line width.  One or two
  identical bytes between differing bytes are folded into the same run.

  Returns 0 after showing the help or version text, otherwise one of the E_*
  codes: E_SAME or E_DIFF after a complete comparison, E_TOO when a single
  difference is longer than the maximum, E_OPT for a bad command line, E_MEM
  if the buffer cannot be allocated and E_NEW/E_OLD if a file cannot be opened.
*/
int main( int argc, char* argv[] )
{
  FILE* file1;                  // Changed file
  FILE* file2;                  // Original file
  long  offset;                 // Offset of difference
  int   byte1, byte2;           // The two bytes
  unsigned char* buf1;          // Changed bytes
  unsigned char* buf2;          // Original bytes
  unsigned char* disp1;         // Changed bytes being displayed
  unsigned char* disp2;         // Original bytes being displayed
  int   len;                    // Length of difference
  int   cnt1, cnt2;             // Length of difference to display
  int   pad;                    // Padding spaces for alignment
  int   maxlen = 160;           // Stop diffing if length exceeds this
  int   out = 1;                // Filename to display
  int   different = E_SAME;     // Were any differences found?

  if (argc == 1 || strcmp( argv[1], "/?" ) == 0 ||
                   strcmp( argv[1], "-?" ) == 0 ||
                   strcmp( argv[1], "--help" ) == 0)
  {
    puts(
    "Byte-wise Difference by Jason Hood <jadoxa@yahoo.com.au>.\n"
    "Version " PVERS " (" PDATE ").  Public Domain.\n"
    "http://misc.adoxa.cjb.net/\n"
    "\n"
    "Display the byte differences between two almost-identical files.\n"
    "\n"
    "bwdiff [-<n>bnos<n>w<n>x] <changed file> <original file>\n"
    "\n"
    "  -<n>   no more than n bytes should differ (default is 160)\n"
    "  -b     bytes only (no strings; same as -s0)\n"
    "  -n     no space between bytes\n"
    "  -o     use original file name, not changed\n"
    "  -s<n>  choose the minimum string length (default is 4)\n"
    "  -w<n>  use n bytes per line (default is 11, or 16 with -n)\n"
    "  -x     use little-endian for two- to four-byte diffs"
    );
    return 0;
  }
  if (strcmp( argv[1], "--version" ) == 0)
  {
    puts( "Byte-wise Difference version " PVERS " (" PDATE ")." );
    return 0;
  }

  // Options may be combined ("-nx40") or given separately; a lone "-" is
  // treated as a file name.  argv[1] is advanced through each option word.
  while (argc > 1 && argv[1][0] == '-' && argv[1][1] != '\0')
  {
    ++argv[1];
    while (*argv[1])
    {
      switch (*argv[1])
      {
        case 'b': string = 0; ++argv[1]; break;
        case 'n': space  = 0; ++argv[1]; break;
        case 'o': out    = 2; ++argv[1]; break;
        case 's': string = (int)strtol( argv[1] + 1, &argv[1], 0 ); break;
        case 'w': width  = (int)strtol( argv[1] + 1, &argv[1], 0 ); break;
        case 'x': little = 1; ++argv[1]; break;
        default:
        {
          // Anything else must be the byte limit.  Zero (which is also what
          // strtol yields for a non-number) and negative values are rejected:
          // a negative limit would turn into an enormous allocation request.
          char* opt = argv[1];
          maxlen = (int)strtol( argv[1], &argv[1], 0 );
          if (maxlen <= 0)
          {
            fprintf( stderr, zUnknownOpt, opt );
            return E_OPT;
          }
        }
        break;
      }
    }
    ++argv; --argc;
  }
  if (argc < 3)
  {
    fputs( zTwoFiles, stderr );
    return E_OPT;
  }

  // Convert bytes per line into characters per line: two hex digits per
  // byte, plus a separating space between bytes unless -n was given.
  if (width <= 0)
    width = 32;
  else if (space)
    width = width * 3 - 1;
  else
    width *= 2;

  // Allocate an extra byte to assist wide string detection.  The size is
  // computed as a size_t so a large limit cannot overflow int arithmetic.
  if ((buf1 = malloc( ((size_t)maxlen + 1) * 2 )) == NULL)
  {
    fputs( zNoMemory, stderr );
    return E_MEM;
  }
  buf2 = buf1 + maxlen + 1;

  if ((file1 = fopen( argv[1], "rb" )) == NULL)
  {
    fprintf( stderr, zBadFile, argv[1] );
    free( buf1 );
    return E_NEW;
  }
  if ((file2 = fopen( argv[2], "rb" )) == NULL)
  {
    fprintf( stderr, zBadFile, argv[2] );
    fclose( file1 );
    free( buf1 );
    return E_OLD;
  }

  fseek( file1, 0, SEEK_END );
  fseek( file2, 0, SEEK_END );
  if (ftell( file1 ) != ftell( file2 ))
    fputs( zWarnSize, stderr );
  rewind( file1 );
  rewind( file2 );

  for (;;)
  {
    do                                  // Skip identical bytes
    {
      byte1 = getc( file1 );
      byte2 = getc( file2 );
    } while (byte1 == byte2 && byte1 != EOF);
    // Bytes are 0..255, so the OR only equals EOF if either file has ended.
    if ((byte1 | byte2) == EOF)
      break;

    if (different == E_SAME)            // First difference: show the name
    {
      fputs( "File: ", stdout );
      if (strchr( argv[out], ' ' ) == NULL)
        puts( argv[out] );
      else
        printf( "\"%s\"\n", argv[out] );
    }
    different = E_DIFF;

    offset = ftell( file1 ) - 1;        // Pointing at byte after first diff.
    printf( "%06lX: ", (unsigned long)offset );

    len = 0;                            // Count the different bytes
    for (;;)
    {
      ++len;
      byte1 = getc( file1 );
      byte2 = getc( file2 );
      if ((byte1 | byte2) == EOF)
        break;
      if (byte1 == byte2)
      {
        // Up to two identical bytes are absorbed into the difference if
        // another differing byte follows; three in a row (or EOF) end it.
        byte1 = getc( file1 );
        byte2 = getc( file2 );
        if ((byte1 | byte2) == EOF)
          break;
        if (byte1 == byte2)
        {
          byte1 = getc( file1 );
          byte2 = getc( file2 );
          if (byte1 == byte2 || (byte1 | byte2) == EOF)
            break;
          ++len;
        }
        ++len;
      }
    }
    if (len > maxlen)
    {
      printf( zTooMuch, len );
      different = E_TOO;
      break;
    }
    // Invalidate the extra byte, in case it's not really there.
    buf1[len] = buf2[len] = 0xFF;
    fseek( file1, offset, SEEK_SET );
    fseek( file2, offset, SEEK_SET );
    // Read byte-sized elements: a short read at end of file then still
    // leaves the bytes that were read (and the 0xFF marker) well defined.
    fread( buf1, 1, (size_t)len + 1, file1 );
    fread( buf2, 1, (size_t)len + 1, file2 );
    pad = 0;
    if (little && len >= 2 && len <= 4)
    {
      display( buf1, -len );
      putchar( '\t' );
      fputs( "[ ", stdout );
      display( buf2, -len );
      puts( " ]" );
    }
    else for (disp1 = buf1, disp2 = buf2; ; disp1 += cnt1, disp2 += cnt1)
    {
      // Both files are split at the same place: use the shorter line.
      cnt1 = calculate_len( disp1, len );
      cnt2 = calculate_len( disp2, len );
      if (cnt2 < cnt1)
        cnt1 = cnt2;
      // Keep the line length within 1..len.  calculate_len can report one
      // byte too many (a wide string ending on the last byte of the diff)
      // or zero (nothing fits a very narrow line); either would stop the
      // remaining length from ever reaching exactly zero.
      if (cnt1 > len)
        cnt1 = len;
      else if (cnt1 < 1)
        cnt1 = 1;
      cnt2 = display( disp1, cnt1 );    // Display changed bytes
      // If there's more than one line, always use maximum width.
      if (disp1 == buf1 && cnt1 != len && cnt2 < width)
        pad = width;
      if (pad - cnt2 > 0)
      {
        printf( "%*c", pad - cnt2, ' ' );
        cnt2 = pad;
      }
      // Past column 40 (including the eight-character offset) a tab would
      // jump too far, so use two spaces instead.
      if (cnt2 >= 40 - 8)
        putchar( ' ' ), putchar( ' ' );
      else
        putchar( '\t' );
      fputs( "[ ", stdout );
      display( disp2, cnt1 );           // Display original bytes
      puts( " ]" );
      len -= cnt1;
      if (len <= 0)
        break;
      fputs( "        ", stdout );      // eight spaces ("offset: ")
      pad = cnt2;                       // Line up the following lines
    }
  }

  fclose( file1 );
  fclose( file2 );
  free( buf1 );

  return different;
}


/*
  Decide whether a printable character that has just been read starts a
  string of at least `string` characters.

  buf points at the byte following that character and len is the number of
  bytes remaining from there.  Returns 1 for a string of single bytes, 2 for
  a little-endian wide string (each character followed by a zero byte) and 0
  if there is no string.  The wide test looks at the byte just beyond len, so
  the caller must keep one readable byte after the data.
*/
int is_string( unsigned char* buf, int len )
{
  const int need = string - 1;          // the first character is already done
  int i;

  if (len < need)
    return 0;

  if (buf[0] != '\0')
  {
    // Ordinary string: the next need bytes must all be printable.
    for (i = 0; i < need; ++i)
    {
      if (!isprint( buf[i] ))
        return 0;
    }
    return 1;
  }

  // The first character was followed by zero: try a wide string, where each
  // of the next need characters is a printable byte followed by a zero.
  if (len < need * 2)
    return 0;

  for (i = 0; i < need; ++i)
  {
    const unsigned char* pair = buf + 1 + 2 * i;

    if (pair[1] != '\0' || !isprint( pair[0] ))
      return 0;
  }
  return 2;
}


/*
  Calculate the length to display: how many of the len bytes at buf fit on
  one output line of `width` characters, mirroring the layout produced by
  display().

  If the line fills up inside a string that did not start the line, the line
  is ended just before that string so the string begins the next line.

  Returns the number of bytes for the line, never more than len.  A result of
  zero is possible when not even the first item fits within `width`.  The
  byte at buf[len] is examined (wide string detection), so it must be
  readable.
*/
int calculate_len( unsigned char* buf, int len )
{
  int byte;
  int quote = 0;                // non-zero while inside a quoted string
  int pos = 0;                  // characters the line would occupy so far
  int str = 0;                  // byte index where the current string began
  int chrs = 0;                 // 0 = not in a string, 1 = narrow, 2 = wide
  int cnt;
  const int avail = len;        // bytes supplied by the caller

  for (cnt = 0; len > 0; ++cnt, --len)
  {
    byte = *buf++;
    // Continue the current string (a wide one needs a zero high byte), or
    // see if a new one starts here.
    if (string && isprint( byte ) && (chrs == 1 ||
         (chrs == 2 && *buf == '\0') ||
         (chrs == 0 && (chrs = is_string( buf, len - 1 )) != 0)))
    {
      if (!quote)
      {
        quote = 1;
        str = cnt;
        if (!space && cnt != 0)         // space separating it from the hex
          ++pos;
        if (chrs == 2)                  // the 'L' prefix
          ++pos;
        ++pos;                          // opening quote
      }
      ++pos;
      if (byte == '"')                  // a quote is written twice
        ++pos;
      if (chrs == 2)                    // step over the zero high byte
        ++cnt, --len, ++buf;
    }
    else
    {
      if (quote)
        quote = chrs = 0, pos += 2;     // Quote & space
      else if (space && len != 1 && pos + 3 <= width)
        ++pos;                          // separating space, if it still fits
      pos += 2;                         // two hex digits
    }
    // Inside a string one column is kept for the closing quote.
    if (pos > width - quote)
    {
      if (quote && str != 0)
        cnt = str;
      break;
    }
  }

  // A wide character in the last byte takes its high byte from buf[len],
  // which would report one byte more than was supplied.
  if (cnt > avail)
    cnt = avail;
  return cnt;
}


/*
  Display len bytes from buf.  Printable characters are displayed within
  quotes (unless there's only one); otherwise two hexadecimal digits are
  used.  If the character is a quote, another quote is also displayed
  (eg. """Hello,"" I said.").  A little-endian wide string is displayed with
  an L prefix and its zero high bytes are skipped; the high byte of a wide
  character in the last position is taken from buf[len], so that byte must
  be readable.

  A negative len selects the little-endian form: the -len bytes are written
  as one hexadecimal number, last byte first, followed by 'x'.

  Returns the number of characters written (in either form).
*/
int display( unsigned char* buf, int len )
{
  int byte;
  int quote = 0;                // non-zero while inside a quoted string
  int out = 0;                  // characters written so far
  int chrs = 0;                 // 0 = not in a string, 1 = narrow, 2 = wide

  if (len < 0)
  {
    len = -len;
    while (--len >= 0)
      out += printf( "%02X", buf[len] );
    putchar( 'x' ), ++out;
  }
  else
  {
    for (; len > 0; --len)
    {
      byte = *buf++;
      // Continue the current string (a wide one needs a zero high byte), or
      // see if a new one starts here.
      if (string && isprint( byte ) && (chrs == 1 ||
           (chrs == 2 && *buf == '\0') ||
           (chrs == 0 && (chrs = is_string( buf, len - 1 )) != 0)))
      {
        if (chrs == 2)                  // step over the zero high byte
          --len, ++buf;
        if (!quote)
        {
          quote = 1;
          // Without byte spacing, separate the string from preceding hex.
          if (!space && out != 0)
            putchar( ' ' ), ++out;
          if (chrs == 2)
            putchar( 'L' ), ++out;
          putchar( '"' ), ++out;
        }
        putchar( byte ), ++out;
        if (byte == '"')
          putchar( '"' ), ++out;
      }
      else
      {
        if (quote)                      // close the string before the hex
        {
          quote = chrs = 0;
          putchar( '"' ), ++out;
          putchar( ' ' ), ++out;
        }
        out += printf( "%02X", byte );
        if (space && len != 1)          // no space after the last byte
          putchar( ' ' ), ++out;
      }
    }
    if (quote)
      putchar( '"' ), ++out;
  }

  return out;
}