/*
  bwdiff.c - Byte-wise Difference: diff two almost-identical files.

  This program is used to display the bytes changed between two files.  The
  bytes from the first (changed) file are displayed first, followed by the
  bytes from the second (original) file, in square brackets.  Bytes that are
  printable (ASCII) are displayed as characters enclosed in quotes (a quote
  itself is duplicated).

  Jason Hood, 7 September, 1998.
  Public Domain.

  981228: display the changed filename, or optionally the original filename;
          if one or two bytes are identical within a difference, include them
          in the diff list.
  000222: stop diffing if too different;
          display message if identical.
  000303: changed message if too different.
  001001: split difference across lines;
          option to disable string translation.

  v1.00, 15 & 16 October, 2003:
    better handling of different sized files;
    display the count if diff is too big;
    surround the filename in quotes if it contains spaces.

  v1.01, 7 September, 2004:
    use a buffer to store the different bytes;
    increase default maximum to 88 bytes;
    line up the last line of a multi-line diff;
    require four printable characters before using a string;
    calculate the lengths of both files, use the smaller;
    modified return codes.

  v1.10, 25 to 27 October, 2012:
    options to select the bytes per line and the space between bytes;
    increased default maximum to 160 bytes;
    option to select the minimum string size;
    warn if the files have different lengths;
    display nothing if the files are identical;
    split a line at a string;
    more flexible option parsing;
    detect wide (two-byte little-endian) strings;
    option to form two- to four-byte diffs as little-endian.
*/ #define PVERS "1.10" #define PDATE "27 October, 2012" #include #include #include #include #ifdef __DJGPP__ void __crt0_load_environment_file( char* dummy ) { } char** __crt0_glob_function( char* dummy ) { return 0; } #endif #ifdef __MINGW32__ int _CRT_glob = 0; #endif // Put back the macro version of getc, which *greatly* improves performance // when using /MD. #if defined(_MSC_VER) && !defined(getc) #define getc(_stream) (--(_stream)->_cnt >= 0 \ ? 0xff & *(_stream)->_ptr++ : _filbuf(_stream)) #endif int string = 4; // Display strings from this many printables int width; // characters to display bytes int space = 1; // space between bytes int little; // use little-endian style for small diffs int calculate_len( unsigned char* buf, int len ); int display( unsigned char* buf, int len ); enum { E_SAME, // Files are identical E_DIFF, // Files are different E_TOO, // Files are too different E_OPT, // Unknown/invalid option E_MEM, // Not enough memory for buffer E_NEW, // Unable to open changed file E_OLD, // Unable to open original file }; #define STRING static const char STRING zUnknownOpt[] = "Unknown option: %s\n"; STRING zTwoFiles[] = "Two file names are required.\n"; STRING zNoMemory[] = "Not enough memory.\n"; STRING zBadFile[] = "%s: unable to open.\n"; STRING zWarnSize[] = "Warning: files have different sizes.\n"; STRING zTooMuch[] = "Difference is too great! 
(%d bytes found.)\n"; int main( int argc, char* argv[] ) { FILE* file1; // Changed file FILE* file2; // Original file long offset; // Offset of difference int byte1, byte2; // The two bytes unsigned char* buf1; // Changed bytes unsigned char* buf2; // Original bytes unsigned char* disp1; // Changed bytes being displayed unsigned char* disp2; // Original bytes being displayed int len; // Length of difference int cnt1, cnt2; // Length of difference to display int pad; // Padding spaces for alignment int maxlen = 160; // Stop diffing if length exceeds this int out = 1; // Filename to display int different = E_SAME; // Were any differences found? if (argc == 1 || strcmp( argv[1], "/?" ) == 0 || strcmp( argv[1], "-?" ) == 0 || strcmp( argv[1], "--help" ) == 0) { puts( "Byte-wise Difference by Jason Hood .\n" "Version " PVERS " (" PDATE "). Public Domain.\n" "http://misc.adoxa.cjb.net/\n" "\n" "Display the byte differences between two almost-identical files.\n" "\n" "bwdiff [-bnoswx] \n" "\n" " - no more than n bytes should differ (default is 160)\n" " -b bytes only (no strings; same as -s0)\n" " -n no space between bytes\n" " -o use original file name, not changed\n" " -s choose the minimum string length (default is 4)\n" " -w use n bytes per line (default is 11, or 16 with -n)\n" " -x use little-endian for two- to four-byte diffs" ); return 0; } if (strcmp( argv[1], "--version" ) == 0) { puts( "Byte-wise Difference version " PVERS " (" PDATE ")." 
); return 0; } while (argc > 1 && argv[1][0] == '-' && argv[1][1] != '\0') { ++argv[1]; while (*argv[1]) { switch (*argv[1]) { case 'b': string = 0; ++argv[1]; break; case 'n': space = 0; ++argv[1]; break; case 'o': out = 2; ++argv[1]; break; case 's': string = (int)strtol( argv[1] + 1, &argv[1], 0 ); break; case 'w': width = (int)strtol( argv[1] + 1, &argv[1], 0 ); break; case 'x': little = 1; ++argv[1]; break; default: { char* opt = argv[1]; maxlen = (int)strtol( argv[1], &argv[1], 0 ); if (maxlen == 0) { fprintf( stderr, zUnknownOpt, opt ); return E_OPT; } } break; } } ++argv; --argc; } if (argc < 3) { fputs( zTwoFiles, stderr ); return E_OPT; } if (width <= 0) width = 32; else if (space) width = width * 3 - 1; else width *= 2; // Allocate an extra byte to assist wide string detection. if ((buf1 = malloc( (maxlen + 1) * 2 )) == NULL) { fputs( zNoMemory, stderr ); return E_MEM; } buf2 = buf1 + maxlen + 1; if ((file1 = fopen( argv[1], "rb" )) == NULL) { fprintf( stderr, zBadFile, argv[1] ); return E_NEW; } if ((file2 = fopen( argv[2], "rb" )) == NULL) { fprintf( stderr, zBadFile, argv[2] ); return E_OLD; } fseek( file1, 0, SEEK_END ); fseek( file2, 0, SEEK_END ); if (ftell( file1 ) != ftell( file2 )) fputs( zWarnSize, stderr ); rewind( file1 ); rewind( file2 ); for (;;) { do // Skip identical bytes { byte1 = getc( file1 ); byte2 = getc( file2 ); } while (byte1 == byte2 && byte1 != EOF); if ((byte1 | byte2) == EOF) break; if (different == E_SAME) { fputs( "File: ", stdout ); if (strchr( argv[out], ' ' ) == NULL) puts( argv[out] ); else printf( "\"%s\"\n", argv[out] ); } different = E_DIFF; offset = ftell( file1 ) - 1; // Pointing at byte after first diff. 
printf( "%06lX: ", offset ); len = 0; // Count the different bytes for (;;) { ++len; byte1 = getc( file1 ); byte2 = getc( file2 ); if ((byte1 | byte2) == EOF) break; if (byte1 == byte2) { byte1 = getc( file1 ); byte2 = getc( file2 ); if ((byte1 | byte2) == EOF) break; if (byte1 == byte2) { byte1 = getc( file1 ); byte2 = getc( file2 ); if (byte1 == byte2 || (byte1 | byte2) == EOF) break; ++len; } ++len; } } if (len > maxlen) { printf( zTooMuch, len ); return E_TOO; } // Invalidate the extra byte, in case it's not really there. buf1[len] = buf2[len] = 0xFF; fseek( file1, offset, SEEK_SET ); fseek( file2, offset, SEEK_SET ); fread( buf1, len + 1, 1, file1 ); fread( buf2, len + 1, 1, file2 ); pad = 0; if (little && len >= 2 && len <= 4) { display( buf1, -len ); putchar( '\t' ); fputs( "[ ", stdout ); display( buf2, -len ); puts( " ]" ); } else for (disp1 = buf1, disp2 = buf2; ; disp1 += cnt1, disp2 += cnt1) { cnt1 = calculate_len( disp1, len ); cnt2 = calculate_len( disp2, len ); if (cnt2 < cnt1) cnt1 = cnt2; cnt2 = display( disp1, cnt1 ); // Display changed bytes // If there's more than one line, always use maximum width. if (disp1 == buf1 && cnt1 != len && cnt2 < width) pad = width; if (pad - cnt2 > 0) { printf( "%*c", pad - cnt2, ' ' ); cnt2 = pad; } if (cnt2 >= 40 - 8) putchar( ' ' ), putchar( ' ' ); else putchar( '\t' ); fputs( "[ ", stdout ); display( disp2, cnt1 ); // Display original bytes puts( " ]" ); len -= cnt1; if (len == 0) break; fputs( " ", stdout ); // eight spaces ("offset: ") pad = cnt2; } } return different; } /* Determine if the bytes form a printable string. 
*/ int is_string( unsigned char* buf, int len ) { if (len < string - 1) // already processed one char on entry return 0; if (*buf == '\0') // test for a little-endian wide string { if (len < (string - 1) * 2) return 0; for (++buf, len = string; --len > 0; buf += 2) if (buf[1] != '\0' || !isprint( *buf )) return 0; return 2; } for (len = string; --len > 0; ++buf) if (!isprint( *buf )) return 0; return 1; } /* Calculate the length to display. */ int calculate_len( unsigned char* buf, int len ) { int byte; int quote = 0; int pos = 0; int str = 0; int chrs = 0; int cnt; for (cnt = 0; len > 0; ++cnt, --len) { byte = *buf++; if (string && isprint( byte ) && (chrs == 1 || (chrs == 2 && *buf == '\0') || (chrs == 0 && (chrs = is_string( buf, len - 1 )) != 0))) { if (!quote) { quote = 1; str = cnt; if (!space && cnt != 0) ++pos; if (chrs == 2) ++pos; ++pos; } ++pos; if (byte == '"') ++pos; if (chrs == 2) ++cnt, --len, ++buf; } else { if (quote) quote = chrs = 0, pos += 2; // Quote & space else if (space && len != 1 && pos + 3 <= width) ++pos; pos += 2; } if (pos > width - quote) { if (quote && str != 0) cnt = str; break; } } return cnt; } /* Display len bytes from buf. Printable characters are displayed within quotes (unless there's only one); otherwise two hexadecimal digits are used. If the character is a quote, another quote is also displayed (eg. """Hello,"" I said."). Returns the number of characters written. 
*/
int display( unsigned char* buf, int len )
{
  int byte;
  int quote = 0;        // inside a quoted string?
  int out   = 0;        // characters written so far
  int chrs  = 0;        // 0: not in a string; 1: bytes; 2: wide

  if (len < 0)
  {
    // Little-endian value: the last byte is the most significant, so write
    // the bytes in reverse order.
    for (len = -len; --len >= 0; )
      out += printf( "%02X", buf[len] );
    putchar( 'x' ), ++out;
    return out;
  }

  for (; len > 0; --len)
  {
    byte = *buf++;
    // A printable byte is part of a string if it continues the current one
    // (for a wide string its high byte must be zero) or if it starts a new
    // one.
    if (string && isprint( byte ) &&
        (chrs == 1 ||
         (chrs == 2 && *buf == '\0') ||
         (chrs == 0 && (chrs = is_string( buf, len - 1 )) != 0)))
    {
      if (chrs == 2)                    // skip the high byte
        --len, ++buf;
      if (!quote)
      {
        quote = 1;
        if (!space && out != 0)         // keep it apart from the hex digits
          putchar( ' ' ), ++out;
        if (chrs == 2)
          putchar( 'L' ), ++out;
        putchar( '"' ), ++out;
      }
      putchar( byte ), ++out;
      if (byte == '"')                  // a quote is doubled
        putchar( '"' ), ++out;
    }
    else
    {
      if (quote)
      {
        quote = chrs = 0;
        putchar( '"' ), ++out;
        putchar( ' ' ), ++out;
      }
      out += printf( "%02X", byte );
      if (space && len != 1)            // no space after the final byte
        putchar( ' ' ), ++out;
    }
  }
  if (quote)
    putchar( '"' ), ++out;

  return out;
}