/*

 File:      findssn.c
 Author:    Graham Toal <gtoal@utpa.edu>
 Copyright: University of Texas Pan American
 Created:   200511150948

 Revision history:

            200511150948 Initial version
            200511151307 First release
            200511181514 Fixed state machine to be more strict
                         (it accepted 99-9999999 and 99999-9999 as
                          valid SSN formats)
            200605110853 Added directory traversal rather than one-shot.
            200606200847 Added hash table for better SSN recognition
            200607031014 Unix port
*/          static char *version = "200607031014"; /*

 Description:

   This is a text file scanner, which detects strings specifically
   of the exact forms "999-99-9999" or "999999999", which logs
   a summary of the content and count of SSNs in each file
   it scans.  It searches a disk tree hierarchically.

   The program checks that the target is in a valid SSN range,
   to help reduce false positives.  The ratio of valid to invalid
   9 digit numbers is noted, in order to eliminate those
   files with many numbers where some of them would be
   SSN-compatible by chance.  (Still some work to be done on
   automating this better.)


 TO DO:

   Reject SSNs which are 9 digits with no "-"s if they are
   immediately followed by alpha.  (long hex strings)


 BUGS:

   Looks like on some WinXP systems the tree walking gets into an
   infinite loop.  May be due to symbolic links?  Or something
   to do with trash folder?  Need to test unix version on loop
   of symlinks.

----------------------------------------------------------
*/

/* Standard C libs */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>

#ifndef FALSE
#define FALSE (0!=0)
#define TRUE (0==0)
#endif

// This code is a bit hacky in that it uses a lot of
// globals for communication, which ideally would be
// passed as parameters...

static FILE *ssnfile;
static char *progname;
static char *filename;
static int ssncount = 0;
#ifdef NEVER
static int boguscount = 0;
#endif
static int lineno = 1;
static int debug = FALSE, verbose = FALSE;
static int maxgroup[1000];

#ifdef CHECK_HASHED_SSNLIST
// hashfun for local UTPA SSN checking.
#include "ssn-hash.h"
#define BITS 10
int hashfun(char *s, int max)
{
  int c;
  int rem;
  int h = 0;

  for (;;) {
    c = *s++;
    if (c == '\0') break;
    if (isalpha(c)) c = c - (isupper(c) ? 'A' : 'a');
    h = (h + (c&15) + 1) * 10;
    for (;;) {
      /* I suspect this can only be executed twice at most but
         I'm lazy and paranoid, hence a loop */
      rem = h / max;
      h = (h % max) + rem;
      if (h < max) break;
    }
  }
  return(h);
}
#endif

// flex array to eliminate duplicates
long *seen = NULL;
int seen_nextfree = 0;
int seen_max = 0;

// note - semantics of realloc means no special initialisation needed
static long *makespace(long *c, int nextfree, int *arraysize, int objsize) {
  if (nextfree >= *arraysize) {
    *arraysize = (*arraysize * 2) + 1;
    c = (long *)realloc(c, (*arraysize+1) * objsize); // 0:arraysize, inclusive. eg 0:15, 0:127 etc
    if (c == NULL) {fprintf(stderr, "findssn: %s\n", strerror(errno)); exit(errno);}
  }
  return c;
}

int known_else_add(long ssn)
{
  int i;

  seen = makespace(seen, seen_nextfree, &seen_max, sizeof(long)); // always extend flex array before use.

  seen[seen_nextfree] = ssn;
  i = 0;
  for (;;) {
    if (seen[i] == ssn) break;
    i = i+1;
  }
  if (i == seen_nextfree) {
    seen_nextfree = seen_nextfree+1; return(FALSE);
  }
  return(TRUE);
}



// see http://www.codecomments.com/archive266-2005-3-410229.html,
// http://www.hri.org/info/help/ssninfo.txt,
// and http://www.ssa.gov/employer/highgroup.txt

void init_groups(void) {
  int i;
  for (i = 0; i < 1000; i++) maxgroup[i] = -1;
  // data extracted from http://www.ssa.gov/employer/highgroup.txt using
  // an editor macro
#include "maxgroup.c"
}

int illegal_ssn(char *ssn)
{
  // explicitly defined by SSA as invalid?
  // Needs a database lookup plus data from SSA.
  return FALSE;
}

// SSN vetting does not have to be 100%.  It is OK to flag some
// invalid SSNs as legal, as long as we don't flag legal SSNs as illegal.

int validgroup(int area, int group)
{
  int cur, even, under10;
  if (maxgroup[area] < 0) return FALSE;

  cur = maxgroup[area];
  even = ((cur&1) == 0);
  under10 = (cur < 10);

  if (debug) fprintf(stderr, "Our SSN's area is %d and group is %d. "
                             " max group for %d is %d\n",
                             area, group, area, cur);

  if (!even && under10) {
    if (debug) fprintf(stderr, "group is odd and < 10\n");
    // our group must therefore also be odd and < 10
    if (group > cur) return FALSE; // range check
    return ((group&1) != 0) && (group < 10);
  }

  if (even && !under10) {
    if (debug) fprintf(stderr, "group is even and >= 10, "
                               "which also allows odd and < 10\n");
    // our group may be odd and < 10, or even and >= 10
    // first range check:
    if (group > cur) return FALSE; // range check
    return (((group&1) != 0) && (group < 10)) 
        || (((group&1) == 0) && (group >= 10));
  }

  if (even && under10) {
    if (debug) fprintf(stderr, "group is even and < 10, "
                               "which also allows even and >= 10, "
                               "plus odd and < 10\n");
    // only illegal group would be if odd and >= 10  (note reversed logic)
    return (!(((group&1) != 0) && (group >= 10)));
  }

  // group must be odd and >= 10.
  // All groups now allowed, modulo range check if odd && >= 10.
  if (debug) fprintf(stderr, "group is odd and >= 10, which means "
                             "anything goes (but can be range checked "
                             "if our group is also odd)\n");
  if (((group&1) != 0) && (group >= 10) && (group > cur)) return FALSE;
  return TRUE;
}

int validate(char *orig)
{
  // First, remove '-' and make canonical.
  static char ssn[12], area[4], group[3], serial[5];
  int iarea, igroup, iserial;

  if (strlen(orig) > 11) /* INTERNAL ERROR */;
  memmove(ssn, orig, 12);
  if (ssn[3] == '-') memmove(ssn+3, ssn+4, 8);
  if (ssn[5] == '-') memmove(ssn+5, ssn+6, 6);

  if (illegal_ssn(ssn)) return 1;

  area[0] = ssn[0];
  area[1] = ssn[1];
  area[2] = ssn[2];
  area[3] = '\0';

  group[0] = ssn[3];
  group[1] = ssn[4];
  group[2] = '\0';

  serial[0] = ssn[5];
  serial[1] = ssn[6];
  serial[2] = ssn[7];
  serial[3] = ssn[8];
  serial[4] = '\0';

  iarea = atoi(area);
  igroup = atoi(group);
  iserial = atoi(serial);

  if (iserial == 0 || igroup == 0) return 1; /* invalid */

/* no longer needed now we have the table ...
  if (iarea == 0 || 
      iarea == 666 || 
      (729 <= iarea && iarea <= 749) || 
      (764 <= iarea && iarea <= 999)) return 1;
 */
  if (!validgroup(iarea, igroup)) return 1;

  // Probably more I can do here, but this is working well enough.

  return 0; /* valid */
}

static int known9, unknown9, invalid9, known11, unknown11, invalid11; // = 0

void ssn_stats(char *ssn, int lineno)
{
  int known = 0, unknown = 0, invalid = 0, i, hashpos, set = 9;
  long ssn_long = atol(ssn);

  if (known_else_add(ssn_long)) {
    // Seen before, so ignore it.
    return;
  }

  ssncount++;

  if (ssn[3] == '-') {
    set = 11;
  }

#ifdef CHECK_HASHED_SSNLIST
  hashpos = hashfun(ssn, N * BITS);
  if ((ssn[0] != '0' && ssn[1] != '0') && ((hashtable[hashpos >> 3] & (1 << (hashpos & 7))) != 0)) {
    known += 1;
  } else {
#endif
    if (validate(ssn) == 0) unknown += 1; else invalid += 1;
#ifdef CHECK_HASHED_SSNLIST
  }
#endif

#ifdef NEVER
  boguscount += (i = validate(ssn));
  // no more than 10 to be printed per file.  Some files may have 100,000+ ...
  if (ssncount < 10) {

    hashpos = hashfun(ssn, N * BITS);
    if (  (  hashtable[hashpos >> 3] & 
             (1 << (hashpos & 7))
          ) != 0
       ) {
      fprintf(stdout, "\"%s\", line %d: %s%s\n", filename, lineno, ssn,
                      i == 0 ? " (Real?!)" : " (Bogus?)");

    } else {
      // bad ...
      fprintf(stdout, "\"%s\", line %d: %s%s\n", filename, lineno, ssn,
                        i == 0 ? "" : " (Bogus?)");
    }
  }
#endif

  if (set == 9) {
    known9 += known;
    unknown9 += unknown;
    invalid9 += invalid;
  } else {
    known11 += known;
    unknown11 += unknown;
    invalid11 += invalid;
  }

  // Summarizing needs to be improved.  Probably better to output
  // to a buffer and then throw the buffer away entirely if we decide
  // that the file didn't contain SSNs.

  // Also we want to parameterize the command line so that we can
  // either look for single SSNs or more, or only files with more
  // than some given parameter of SSNs (eg 1000) to find proper
  // databases, rather than odd emails etc.

  // Need to look at ratio of good to bogus too, when the numbers
  // are large.

}

/* Main procedure to scan input file and identify sequences of
   digits which look like SSNs, i.e. nnn-nn-nnnn or nnnnnnnnn
   *without* any extra digits before or after. */

void state_machine(int c)
{

#define STATE_ANY 0
#define STATE_DIGIT1 1
#define STATE_DIGIT2 2
#define STATE_DIGIT3 3
#define STATE_DASH1  4
#define STATE_DIGIT4 5
#define STATE_DIGIT5 6
#define STATE_DASHDIGIT4 15
#define STATE_DASHDIGIT5 16
#define STATE_DASH2  7
#define STATE_DIGIT6 8
#define STATE_DIGIT7 9
#define STATE_DIGIT8 10
#define STATE_DIGIT9 11
#define STATE_GOBBLE_DIGITS 12

  static int state = STATE_ANY;

  static char buffer[12] = { 0 }; // 9 digits, 2 dashes, and trailing NUL.
  static char *ssn = buffer;

  if (c == '\n') lineno++; // if needed for diagnostics

  if (c != EOF) switch (state) {
  case STATE_ANY:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT1; return;}
    /*putchar(c);*/
    return;
  case STATE_DIGIT1:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT2; return;}
    break;
  case STATE_DIGIT2:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT3; return;}
    break;
  case STATE_DIGIT3:
    if (c == '-') {*ssn++ = c; state = STATE_DASH1; return;}
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT4; return;}
    break;
  case STATE_DASH1:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DASHDIGIT4; return;}
    break;
  case STATE_DASHDIGIT4:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DASHDIGIT5; return;}
    break;
  case STATE_DIGIT4:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT5; return;}
    break;
  case STATE_DASHDIGIT5:
    if (c == '-') {*ssn++ = c; state = STATE_DASH2; return;}
    break;
  case STATE_DIGIT5:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT6; return;}
    break;
  case STATE_DASH2:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT6; return;}
    break;
  case STATE_DIGIT6:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT7; return;}
    break;
  case STATE_DIGIT7:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT8; return;}
    break;
  case STATE_DIGIT8:
    if (isdigit(c)) {*ssn++ = c; state = STATE_DIGIT9; return;}
    break;
  case STATE_DIGIT9:
    if (isdigit(c) /* || isalpha(c) */) {
      // if we have another digit, it was not an SSN
      // Am considering changing the test to isalnum()
      // because of some hex strings with accidental overlap
      state = STATE_GOBBLE_DIGITS;
      *ssn = '\0'; /*fprintf(stdout, "%s", buffer);*/ ssn = buffer; fflush(stdout);
    } else {
      // otherwise it was an SSN, so log it.
      state = STATE_ANY;
      *ssn = '\0';
      ssn_stats(buffer, lineno);
      // fprintf(stdout, "%s\n", convert_ssn(buffer, lineno, fab, rab));
      ssn = buffer; fflush(stdout);
    }
    // Fall through
  case STATE_GOBBLE_DIGITS:
    // we want to explicitly discard trailing digits after a 9-digit sequence
    // rather than switch to STATE_ANY because otherwise an 18-digit sequence would
    // discard the first 9 digits then accept the next 9 as an SSN.
    if (!isdigit(c)) state = STATE_ANY; // end of a number sequence
    /*putchar(c);*/
    return;
  }
  // All 'break' statements above imply flush() and state = STATE_ANY ...
  *ssn = '\0'; /*fprintf(stdout, "%s", buffer);*/ ssn = buffer; fflush(stdout);
  /*if (c != EOF) putchar(c);*/
  state = STATE_ANY;
}

char *last60(char *s)
{
  int l = strlen(s);
  if (l > 54) return s + l - 54;
  return s;
}

void DoOneFile(void) {
  long long int block = 0LL;
  // potentially could eliminate .dll, .bmp, .exe etc at this point.
  // for now, search *everything*.  Maximal paranoia mode.


  seen_nextfree = 0; // reset ptr (but not array size) on each file.

  ssnfile = fopen(filename, "rb");
  if  (ssnfile == NULL) {
    if (errno != EACCES) {
      fprintf(stderr, "%s: %s - %s\n", progname, filename, strerror(errno));
    }
    return; // may be locked etc.  Not fatal if cannot open every file...
            // (also, directories give 'permission denied')
  }
  ssncount = 0;
#ifdef NEVER
  boguscount = 0;
#endif
  lineno = 1;

  // for bogisity tests:
  known9 = 0;
  unknown9 = 0;
  invalid9 = 0;
  known11 = 0;
  unknown11 = 0;
  invalid11 = 0;

  /*************** MAIN LOOP WITHIN A FILE ***************/
  for (;;) {
    if (((block & 0xffffLL) == 0x0000LL) && verbose) fprintf(stderr,
                   "Processing %s %06llx  \r",
                   last60(filename), block>>16LL);
    int c = fgetc(ssnfile);
    if (c == '\r') continue;
    state_machine(c);
    if (c == EOF) break;
    block += 1LL;
  }
  if (verbose) fprintf(stderr,
                   "                                                                               \r");
  /***************   END OF ONE FILE LOOP  ***************/
  fclose(ssnfile);


  // We can either do the filtering algorithm here, or defer
  // it to a post-processing phase that has the CSV file as input.

  // However in neither case are we particularly interested in the
  // huge numbers of files for which all counters are zero.

  /*if ( ((known9 > 1) || (known11 > 1))
      && (known9 || unknown9 || known11 || unknown11)
      && ((known9 > unknown9) || (unknown9 > invalid9)
         || (known11 > unknown11) || (unknown11 > invalid11)))*/ 
  if (known9 || unknown9 || known11 || unknown11) /* are any non-0? */
    fprintf(stdout, "%d,%d,%d,%d,%d,%d,\"%s\"\n",
            known9, unknown9, invalid9,
            known11, unknown11, invalid11, filename);
}

#ifdef __unix__
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>

int ForAllFiles(int level, const char *dirname) {
  DIR *dd;
  struct dirent *dent;
  struct stat statbuf;
  char *fname;
  int scanret = 0;

    if ((dd = opendir(dirname)) != NULL) {
	while ((dent = readdir(dd))) {
#ifndef C_INTERIX
	    if (dent->d_ino)
#endif
	    {
		if (strcmp(dent->d_name, ".") && strcmp(dent->d_name, "..")) {
		    /* build the full name */
		    fname = calloc(strlen(dirname) + strlen(dent->d_name) + 2, sizeof(char));
		    sprintf(fname, "%s/%s", dirname, dent->d_name);

		    /* stat the file */
		    if (lstat(fname, &statbuf) != -1) {
			if (S_ISDIR(statbuf.st_mode) && !S_ISLNK(statbuf.st_mode)) {
			    ForAllFiles(level+1, fname);
			} else {
			    // PERFORM ACTION: 
			    if(S_ISREG(statbuf.st_mode)) {
			      filename = fname;
			      DoOneFile();
			    }
                        }
		    }
		    free(fname);
		}
	    }
	}
    } else {
	if (verbose) fprintf(stderr, "%s: Can't open directory %s\n", progname, dirname);
	return 53;
    }
    closedir(dd);
    return (scanret ? 1 : 0);
}
#else

#ifdef OLDSTYLE /* DOS API. Fails on some machines - infinite loop */

/* Hacked from "dir.c" at http://www-informatik.hs-harz.de/NVMCK/Labor_sapi.doc ... */

/* non-standard libs for findfirst etc under LCC compiler on Windows. Possibly others? */

#include <io.h>
#include <direct.h>

int ForAllFiles(int level, char *param) {
  static char current[256];
  struct _finddata_t ft;
  long int lf;
  int bMore = FALSE;

  if (level == 0) {
    _getcwd(current, sizeof(current)-1);
    _chdir(param); // if dir as a param, do this
  }
  lf = _findfirst( "*.*", &ft);
  bMore = (lf != -1L);
  while (bMore) {
    if  ( ft.name[0] != '.')  {
      if  (ft.attrib & _A_SUBDIR) {
        _chdir(ft.name);
        ForAllFiles(level+1, param);
        _chdir("..");
      } else {
        _getcwd(current, sizeof(current)-1);
        strcat(current, "\\");
        strcat(current, ft.name);
        filename = current; // another nasty global
        DoOneFile(); // BUT WOULD PREFER NOT TO IF IT IS A DIRECTORY!!!
                     // (relying on dir giving EACCES in WinXX)
      }
    }
    bMore = !_findnext(lf, &ft) ;
  }
  if (level == 0)_chdir(current);
  return 1;
}
#else /* WIN32 API */
#include <io.h>
#ifdef TRUE
#undef TRUE
#undef FALSE
#endif
#ifdef N
#undef N
#endif
#include <windows.h>

/*

  Program to walk the directory tree (from a nominated starting point)
  and print the names of all files found (excepting directories)


  Author:     Nick Gammon  <nick@gammon.com.au>
  Hacked by:  Graham Toal
  Web:        http://www.gammon.com.au

  Date:       8th August 1999
  Hacked on:  6th July 2006

  Usage:  treeinfo <starting_point>

  eg.     treeinfo c:\games

*/


void PrintWin32Error (DWORD ErrorCode)
{
 
  LPVOID lpMsgBuf;
  FormatMessage (FORMAT_MESSAGE_ALLOCATE_BUFFER |  
                 FORMAT_MESSAGE_FROM_SYSTEM |     
                 FORMAT_MESSAGE_IGNORE_INSERTS,    
                 NULL,
                 ErrorCode,
                 MAKELANGID (LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language
                 (LPTSTR) &lpMsgBuf,    
                 0,    
                 NULL );
	fprintf(stderr, "%s", lpMsgBuf );
	LocalFree( lpMsgBuf );
}

void process_dir (char * dirname, 
                  char * subdirname, 
                  int level,
                  __int64 * totalsize,
                  long * totalfiles)
  {
  char * thisdir = NULL;
  WIN32_FIND_DATA fileinfo;
  HANDLE hdl;
  double mb;

  // we add 4 below for:
  //   null at end: 1
  //   \ between dir and subdir name : 1
  //   \* at end: 2

  thisdir = malloc (strlen (dirname) + strlen (subdirname) + 4);
  if (!thisdir)
    {
    fprintf(stderr, "*** Unable to allocate memory for directory name: %s\\%s\n", 
            dirname, subdirname);
    return;
    }

  strcpy (thisdir, dirname);
  if (strlen (subdirname) > 0)
    {
    strcat (thisdir, "\\");
    strcat (thisdir, subdirname);
    }
  strcat (thisdir, "\\*");

  if (strlen (thisdir) > _MAX_PATH)
    {
    fprintf(stderr, "*** pathname: \"%s\" is too long to process\n", thisdir);
    free (thisdir);
    return;
    }

  hdl = FindFirstFile (thisdir, &fileinfo);
  
  thisdir [strlen (thisdir) - 2] = 0;   // remove wildcard from end

  if (hdl == INVALID_HANDLE_VALUE)
    {
    DWORD nErr = GetLastError ();

    if (level != 0 && nErr == ERROR_NO_MORE_FILES)
      {
      free (thisdir);
      return;
      }

    fprintf(stderr, "*** %s : ", thisdir);    
    PrintWin32Error (nErr);
    free (thisdir);
    return;
    }

  do
    {
    
    if (fileinfo.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
      {
      if (strcmp (".", fileinfo.cFileName) != 0 &&
          strcmp ("..", fileinfo.cFileName) != 0)
        {
        __int64 total = 0;
        long    files = 0;
        process_dir (thisdir, fileinfo.cFileName, level + 1, &total, &files);
        *totalsize += total;
        *totalfiles += files;
        }
      }
    else
      {
      LARGE_INTEGER nSize;
      char pathname[1024*10]; // beware of buffer overflow.  Need to fix this.

      nSize.HighPart = fileinfo.nFileSizeHigh;
      nSize.LowPart = fileinfo.nFileSizeLow;

      *totalsize += nSize.QuadPart;
      (*totalfiles)++;
      // if (level == 0) ...
      mb = (double) nSize.QuadPart / 1024.0 / 1024.0;
      // fprintf(stdout, "%s\\%s \n", /*mb,*/ thisdir, fileinfo.cFileName);
      sprintf(pathname, "%s\\%s", thisdir, fileinfo.cFileName);
      filename = pathname;
      DoOneFile();
      }


    if (FindNextFile (hdl, &fileinfo) == 0)
      {
      DWORD nErr = GetLastError ();

      if (nErr != ERROR_NO_MORE_FILES)
        {
        fprintf(stderr, "*** %s : ", thisdir);    
        PrintWin32Error (nErr);
        }


      FindClose (hdl);

      // if level == 1 ...
        mb = (double) *totalsize / 1024.0 / 1024.0;

      free (thisdir);
      return;
      }

    } while (1);

  };


int ForAllFiles(int level, char *param) {
  __int64 total = 0;
  long    files = 0;

  char starting_point [_MAX_PATH] = "c:";
  
  if ((param != NULL) && (*param != '\0')) strcpy (starting_point, param);

  // remove trailing backslash from starting point
  if (starting_point [strlen (starting_point) - 1] == '\\')
    starting_point [strlen (starting_point) - 1] = 0;

  process_dir (starting_point, "", 0, &total, &files);

  return 0;

} 

#endif /* OLDSTYLE */
#endif /* not __unix__ , i.e. Windows */

int main (int argc, char **argv)
{
   int c;
   char *s;

   progname = (((s=strrchr(argv[0], '/')) != NULL ? s+1:
               (s=strrchr(argv[0], '\\')) != NULL ? s+1:
               argv[0]));

   if ((argc == 3) && (strcmp(argv[1], "-v") == 0)) {
     verbose = TRUE;
     argc -= 1;
     argv += 1;
   }
   if (argc != 2) {
     fprintf(stderr, "syntax: %s [-v] rootdir\n", progname);
     exit(EXIT_FAILURE);
   }

// BUG TO BE FIXED - PROBLEMS CAUSED BY TRAINING \ IN DIRECTORY NAME

   fprintf(stdout, "known9,unknown9,invalid9,known11,unknown11,invalid11,filename\n");
   init_groups();
   /****************** LOOP OVER ALL FILES ****************/
   (void)ForAllFiles(0, argv[1]);
   /************   END OF LOOP OVER ALL FILES   ************/

   exit(EXIT_SUCCESS);
   if (version != NULL) return(EXIT_FAILURE);
}