/* -*- Mode: C -*-
 *
 *  blocksort  sort a text file by blocks
 *
 *  $Id: blocksort.c,v 1.6 2003/05/06 02:53:52 bkorb Exp $
 */
#include "system.h"
#include "bsort-opt.h"

/*
 *  Number of regmatch_t slots handed to every regexec() call:
 *  slot zero is the whole match, the rest are subexpressions.
 */
#define MAX_MATCH 16

/*
 *  What runStep() learned about one block:
 *
 *  braCt         how many leading entries of the two arrays are filled in
 *  braStartList  address, inside the block text, where each match starts
 *  braLen        byte length of each match (rm_eo - rm_so)
 */
typedef struct step_res {
    int       braCt;
    char*     braStartList[MAX_MATCH];
    size_t    braLen[MAX_MATCH];
} tStepRes;

/*
 *  Start of the buffer returned by loadFile().  Set once in main() and
 *  used by the diagnostics to turn a pointer into a character offset.
 */
char*    pzText;
/*  Receives the regerror() message text in findStart().  */
char     zErrBuf[ 128 ];
/*  Compiled PATTERN option; executed by findBlock().  */
regex_t  patternRe;
/*  Compiled KEY option (only when given); executed by runStep().  */
regex_t  keyRe;
/*  Compiled TRAILER option (only when given); executed by findTail().  */
regex_t  trailerRe;
/*  Compiled START option (only when given); executed by findStart().  */
regex_t  startRe;

/*
 *  Match results shared by every regexec() call in this file.
 *  Each call overwrites whatever the previous one left here.
 */
regmatch_t   aMatch[ MAX_MATCH ];
/*  Not referenced by any code visible in this file.  */
int          nmatch;

/*
 *  Forward declarations for the routines defined after main().
 */
/*  Read all of stdin into an allocated buffer and return the buffer.  */
extern char* loadFile( void );
/*
 *  Search from the line after pzScan for the next PATTERN match.
 *  Returns the start of the line holding the match, or NULL.
 */
extern char* findBlock( char* pzScan );
/*
 *  Look for the TRAILER pattern in the last block.
 *  Returns the start of the trailer text, or NULL when there is none.
 */
extern char* findTail(  char* pzScan );
/*
 *  Compile the option expressions and return where block scanning starts.
 */
extern char* findStart( char* pzScan );
/*  qsort() comparison routine; each argument points at a char* block.  */
extern int   compareChunk( const void* one, const void* two );
/*  Remove trailing white space from a block, in place.  */
extern void  trim( char* pzBlock );


/*
 *  main
 *
 *  Load all of standard input, split it into blocks at the lines matched
 *  by the PATTERN option, sort the blocks and write the leading text,
 *  the sorted blocks and any trailer to standard output.
 *
 *  Returns 0 on success.  Exits with EXIT_FAILURE when no block is found
 *  and returns EXIT_FAILURE when the output could not be written.
 */
int
main( int    argc,
      char** argv )
{
  char*  pzHead;
  char*  pzTail = (char*)NULL;
  char*  pzScan;

  size_t   blockCt   = 0;
  size_t   allocCt   = 0;
  char**   ppzBlList = (char**)NULL;
  int      wrFail;

  optionProcess( &blocksortOptions, argc, argv );
  pzText = pzHead = loadFile();

  /*
   *  findBlock() plants a NUL just ahead of the block it returns,
   *  so after this call pzHead is a string holding only the leading text.
   */
  pzScan = findBlock( findStart( pzHead ));
  if (pzScan == (char*)NULL) {
    fprintf( stderr, "ERROR:  no matching patterns were found\n" );
    exit( EXIT_FAILURE );
  }

  /*
   *  Collect the start address of every block.  Each findBlock() call
   *  also terminates the block that precedes the one it finds.
   */
  for (;;) {
    char* pz;

    if (blockCt >= allocCt) {
      allocCt += 32;
      ppzBlList = (char**)xrealloc( (void*)ppzBlList,
                                    allocCt * sizeof( *ppzBlList ));
    }

    ppzBlList[ blockCt++ ] = pzScan;
    pz = findBlock( pzScan );
    if (pz == (char*)NULL) {
      /*
       *  No further block:  split any trailer off the last one.
       */
      pzTail = findTail( pzScan );
      break;
    }

    pzScan = pz;
  }

  /*
   *  With the SPACING option, strip the trailing white space from every
   *  block so that the spacing written below is the only spacing.
   */
  if (HAVE_OPT( SPACING )) {
    char**  ppz = ppzBlList;
    int     ct  = blockCt;
    do  {
      trim( *(ppz++) );
    } while (--ct > 0);
  }

  /*
   *  After the increment, the value is the number of newlines
   *  written after each block.
   */
  OPT_VALUE_SPACING++;

  qsort( ppzBlList, blockCt, sizeof( *ppzBlList ), compareChunk );
  fputs( pzHead, stdout );
  fputc( '\n', stdout );

  while (blockCt-- > 0) {
    int  nlCt = OPT_VALUE_SPACING;
    fputs( *(ppzBlList++), stdout );
    do  {
      fputc( '\n', stdout );
    } while (--nlCt > 0);
  }

  if (pzTail != (char*)NULL)
    fputs( pzTail, stdout );

  /*
   *  A failed write (full disk, closed pipe) may only show up when the
   *  stream is flushed, so check both the error flag and the close.
   */
  wrFail = ferror( stdout );
  if (fclose( stdout ) != 0)
    wrFail = 1;
  if (wrFail) {
    fputs( "ERROR:  could not write all of the sorted output\n", stderr );
    return EXIT_FAILURE;
  }
  return 0;
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 *  Subroutines:
 */

/*
 *  trim
 *
 *  Remove trailing white space from a string, in place.
 *
 *  pzBlock  NUL-terminated text; it is shortened by storing a NUL
 *           after the last character that is not white space.
 *           A string that is empty or all white space becomes empty.
 */
void
trim( char* pzBlock )
{
  char* pzEnd = pzBlock + strlen( pzBlock );

  /*
   *  The ctype functions are only defined for EOF and values that fit
   *  in an unsigned char, so a plain (possibly negative) char must be
   *  converted before it is classified.
   */
  while ((pzEnd > pzBlock) && isspace( (unsigned char)pzEnd[-1] ))  pzEnd--;
  *pzEnd = '\0';
}


/*
 *  runStep
 *
 *  Apply the compiled KEY expression to one block and record, for the
 *  whole match and for each following subexpression, where the matched
 *  text starts and how long it is.
 *
 *  pz    NUL-terminated text of the block
 *  pRes  receives the result.  braCt is always set:  zero when the key
 *        does not match, otherwise the number of leading aMatch[]
 *        entries that passed the offset check below.  Only the first
 *        braCt entries of braStartList[] and braLen[] are filled in.
 *
 *  The global aMatch[] array is overwritten.  When the key does not
 *  match, a diagnostic showing at most the first 60 characters of the
 *  block is written to stderr.
 */
void
runStep( char* pz, tStepRes*  pRes )
{
  regmatch_t* pRM = aMatch;
  int         ct  = MAX_MATCH;
  regoff_t    lim = strlen( pz );

#   ifdef DEBUG
  memset( (void*)pRes, 0, sizeof( *pRes ));
#   endif

  /*
   *  compareChunk() reads braCt unconditionally, so it must have a
   *  defined value even when the key expression does not match.
   */
  pRes->braCt = 0;

  if (regexec( &keyRe, pz, MAX_MATCH, aMatch, 0 ) == 0) {
    do {
      /*
       *  This is a bogus comparison.
       *  The problem is that I found an implementation that
       *  put 0x4008EC80 into rm_so.  This is the hack that
       *  will work with a correct implementation.
       *  The value should be -1.
       *
       *  Compared as unsigned values, both -1 and any offset that is
       *  not inside the string end the list of usable entries.
       */
      if ((unsigned)(pRM->rm_so) >= (unsigned)lim)
        break;

      pRes->braStartList[MAX_MATCH-ct] = pz + pRM->rm_so;
      pRes->braLen[      MAX_MATCH-ct] = pRM->rm_eo - pRM->rm_so;
      pRM++;
    } while (--ct > 0);

    pRes->braCt = MAX_MATCH - ct;

    if (HAVE_OPT( VERBOSE )) {
      fprintf( stderr, "%d compare chunks found after %5d\n", pRes->braCt,
               (u_int)(pz - pzText) );
    }
  } else {
    /*
     *  Show only the start of a long block:  cut it off temporarily
     *  at 60 characters and put the saved character back afterward.
     */
    char sv = NUL;
    if (strlen( pz ) > 60) {
      sv = pz[60];
      pz[60] = NUL;
    }

    fprintf( stderr, "no compare chunks found at %d\n"
             "\n'%s' not found within\n\n%s\n",
             (u_int)(pz - pzText), OPT_ARG( KEY ), pz );
    if (sv != NUL)
      pz[60] = sv;
  }
}


/*
 *  compareChunk
 *
 *  qsort() comparison routine for the list of blocks.
 *
 *  one, two   each points at a char* that addresses a block's text
 *
 *  Without the KEY option the blocks are compared with strcmp().
 *  With it, runStep() is applied to both blocks and the result is:
 *
 *   - the difference of the two match counts when either block has
 *     no subexpression match (the block with fewer matches sorts first);
 *   - otherwise the first difference found between corresponding
 *     matched regions, a shorter region sorting ahead of a longer one
 *     that it is a prefix of;
 *   - the difference of the match counts when all shared regions agree.
 *
 *  When both blocks have exactly one match (the full key only), that
 *  full match is the region compared.  When both have subexpression
 *  matches, the full match is skipped and only subexpressions count.
 */
int
compareChunk( const void* one, const void* two )
{
  char*     pzLeft  = *((char**)one);
  char*     pzRight = *((char**)two);
  tStepRes  left;
  tStepRes  right;
  int       countDiff;
  int       region;
  int       regionLim;

  if (! HAVE_OPT( KEY ))
    return strcmp( pzLeft, pzRight );

  runStep( pzLeft,  &left );
  runStep( pzRight, &right );

  countDiff = left.braCt - right.braCt;

  /*
   *  No key match at all in the first block:  the counts decide.
   */
  if (left.braCt == 0)
    return countDiff;

  if (left.braCt == 1) {
    /*
     *  The first block matched the key but no subexpression.  If the
     *  second one has subexpression matches, the first sorts ahead
     *  of it (the count difference is negative).
     */
    if (right.braCt > 1)
      return countDiff;
    region = 0;

  } else {
    /*
     *  The first block has subexpression matches.  If the second one
     *  has none, the second sorts ahead (the difference is positive).
     */
    if (right.braCt <= 1)
      return countDiff;
    region = 1;   /* the full match is not part of the key */
  }

  /*
   *  Only regions present in both results can be compared.
   */
  regionLim = (countDiff > 0) ? right.braCt : left.braCt;

  while (region < regionLim) {
    int  lenDiff = left.braLen[region] - right.braLen[region];
    int  cmpLen;
    int  cmpRes;

    /*
     *  Compare over the length of the shorter of the two regions.
     */
    if (lenDiff > 0)
      cmpLen = right.braLen[region];
    else
      cmpLen = left.braLen[region];

    cmpRes = strncmp( left.braStartList[region],
                      right.braStartList[region],
                      cmpLen );
    if (cmpRes != 0)
      return cmpRes;

    /*
     *  Equal that far:  a length difference settles it.
     */
    if (lenDiff != 0)
      return lenDiff;

    region++;
  }

  /*
   *  Every shared region is identical, so fall back on the counts.
   */
  return countDiff;
}


/*
 *  findBlock
 *
 *  Locate the next block after the line that pzScan points into.
 *
 *  pzScan   NUL-terminated text; the search begins at the first
 *           newline at or after this address
 *
 *  Returns the address of the first character of the line in which
 *  the PATTERN expression matched, or NULL when there is no further
 *  newline or no further match.  On success the character just ahead
 *  of the returned address is overwritten with NUL, which terminates
 *  the text that precedes the new block.  aMatch[] is overwritten.
 */
char*
findBlock( char* pzScan )
{
  static int  srchCt = 0;   /* number of successful pattern matches */
  char*       pzNextLine;
  char*       pzBlock;

  /*
   *  The current line is never searched:  skip to its newline.
   */
  pzNextLine = strchr( pzScan, '\n' );
  if (pzNextLine == (char*)NULL)
    return (char*)NULL;

  if (regexec( &patternRe, pzNextLine, MAX_MATCH, aMatch, 0 ) != 0)
    return (char*)NULL;

  srchCt++;

  if (aMatch[0].rm_so == -1) {
    fprintf( stderr, "%2d-th pattern had no offset\n", srchCt );
    return (char*)NULL;
  }

  /*
   *  Walk back from the match to the newline that precedes it,
   *  going no further back than the newline the search started on.
   */
  for (pzBlock = pzNextLine + aMatch[0].rm_so;
       pzBlock > pzNextLine;
       pzBlock--) {
    if (*pzBlock == '\n')
      break;
  }

  /*
   *  End the preceding text here; the block starts one character on.
   */
  *pzBlock = NUL;
  pzBlock++;

  if (HAVE_OPT( VERBOSE ))
    fprintf( stderr, "%2d-th pattern found at char %d\n", srchCt,
             (u_int)(pzBlock - pzText) );

  return pzBlock;
}


/*
 *  findStart
 *
 *  Compile the regular expressions named by the PATTERN, KEY, TRAILER
 *  and START options, then locate the place where block scanning begins.
 *
 *  pzScan   start of the NUL-terminated input text
 *
 *  The compile flags start as REG_EXTENDED | REG_NEWLINE and are
 *  adjusted by each SYNTAX option value:  a unique-prefix abbreviation of
 *  EXTENDED, ICASE or NEWLINE in any case, optionally preceded by
 *  "NO" or "NOT" and an optional hyphen to clear the flag instead.
 *  The SYNTAX option strings are converted to upper case in place.
 *
 *  Returns pzScan unchanged when there is no START option.  Otherwise
 *  the result depends on how the START match ends:  if it ends with a
 *  newline, the address just past the match; if not, the address of the
 *  next newline after the match, or of the terminating NUL when no
 *  newline follows.
 *
 *  An expression that does not compile or an unrecognized SYNTAX
 *  keyword is reported and USAGE( EXIT_FAILURE ) is called.  A START
 *  expression that is not found is reported and the program exits
 *  with EXIT_FAILURE.
 */
char*
findStart( char* pzScan )
{
  int   rules = REG_EXTENDED | REG_NEWLINE;
  int   rerr;
  tSCC  zBadExpr[] = "Error %d (%s) compiling pattern:\n\t'%s'\n";
  tSCC  zIsOkay[]  = "%8s regex: ``%s'' os okay.\n";

  if (HAVE_OPT( SYNTAX )) {
    int     ct = STACKCT_OPT(  SYNTAX );
    char**  pp = STACKLST_OPT( SYNTAX );

    do  {
      char* p = (*pp++);
      char* q = p;
      int   l = strlen( p );
      int   invert = 0;
      int   newrul;

      /*
       *  An empty value changes nothing.
       */
      if (l < 1)
        continue;

      /*
       *  The ctype functions require a value representable as
       *  unsigned char, so convert the plain char before use.
       */
      while (*q) {
        if (islower( (unsigned char)*q ))
          *q = toupper( (unsigned char)*q );
        q++;
      }

      /*
       *  Accept a prefix of "NO" or "NOT" with or without a following
       *  hyphen to invert the sense.  i.e., turn off EXTENDED or NEWLINE.
       */
      if (strncmp( p, "NO", 2 ) == 0) {
        invert = 1;
        p += 2;
        l -= 2;
        if (*p == 'T') { p++; l--; }
        if (*p == '-') { p++; l--; }
      }

      /*
       *  The keyword may be abbreviated, so only its own length is
       *  compared.  A zero length comparison always succeeds, so a
       *  bare prefix ("NO", "NOT-") must be rejected explicitly or
       *  it would be taken for EXTENDED.
       */
      if ((l > 0) && (strncmp( "EXTENDED", p, l ) == 0))
        newrul = REG_EXTENDED;
      else if ((l > 0) && (strncmp( "ICASE", p, l ) == 0))
        newrul = REG_ICASE;
      else if ((l > 0) && (strncmp( "NEWLINE", p, l ) == 0))
        newrul = REG_NEWLINE;
      else {
        tSCC zNot[] = "Error:  '%s' is invalid syntax selection\n";
        fprintf( stderr, zNot, p );
        USAGE( EXIT_FAILURE );
        /* NOTREACHED */
#ifndef WARNINGS_OKAY
        newrul = 0;
#endif
      }

      if (invert)
        rules &= ~newrul;
      else
        rules |= newrul;

    } while (--ct > 0);
  }

  if (HAVE_OPT( VERBOSE )) {
    if (rules == 0)
      fputs( "No regex option bits are set\n", stderr );
    else {
      fputs( "Regex option bits set: ", stderr );
      if (rules & REG_EXTENDED) fputs( " EXTENDED", stderr );
      if (rules & REG_ICASE)    fputs( " ICASE",    stderr );
      if (rules & REG_NEWLINE)  fputs( " NEWLINE",  stderr );
      fputc( '\n', stderr );
    }
  }

  /*
   *  PATTERN -- main segmentation pattern
   */
  rerr = regcomp( &patternRe, OPT_ARG( PATTERN ), rules );
  if (rerr != 0) {
    regerror( rerr, &patternRe, zErrBuf, sizeof( zErrBuf ));

    fprintf( stderr, zBadExpr, rerr, zErrBuf, OPT_ARG( PATTERN ));
    USAGE( EXIT_FAILURE );
  }
  if (HAVE_OPT( VERBOSE ))
    fprintf( stderr, zIsOkay, "Pattern", OPT_ARG( PATTERN ));

  /*
   *  KEY -- sort key selection
   */
  if (HAVE_OPT( KEY )) {
    rerr = regcomp( &keyRe, OPT_ARG( KEY ), rules );
    if (rerr != 0) {
      regerror( rerr, &keyRe, zErrBuf, sizeof( zErrBuf ));

      fprintf( stderr, zBadExpr, rerr, zErrBuf, OPT_ARG( KEY ));
      USAGE( EXIT_FAILURE );
    }
    if (HAVE_OPT( VERBOSE ))
      fprintf( stderr, zIsOkay, "Key", OPT_ARG( KEY ));
  }

  /*
   *  TRAILER -- trailer preservation pattern
   */
  if (HAVE_OPT( TRAILER )) {
    rerr = regcomp( &trailerRe, OPT_ARG( TRAILER ), rules );
    if (rerr != 0) {
      regerror( rerr, &trailerRe, zErrBuf, sizeof( zErrBuf ));

      fprintf( stderr, zBadExpr, rerr, zErrBuf, OPT_ARG( TRAILER ));
      USAGE( EXIT_FAILURE );
    }
    if (HAVE_OPT( VERBOSE ))
      fprintf( stderr, zIsOkay, "Trailer", OPT_ARG( TRAILER ));
  }

  if (! HAVE_OPT( START ))
    return pzScan;

  /*
   *  START -- start sequence preservation pattern
   */
  rerr = regcomp( &startRe, OPT_ARG( START ), rules );
  if (rerr != 0) {
    regerror( rerr, &startRe, zErrBuf, sizeof( zErrBuf ));

    fprintf( stderr, zBadExpr, rerr, zErrBuf, OPT_ARG( START ));
    USAGE( EXIT_FAILURE );
  }
  if (HAVE_OPT( VERBOSE ))
    fprintf( stderr, zIsOkay, "Start", OPT_ARG( START ));

  /*
   *  Find that start sequence now and return the address of the following text
   */
  rerr = regexec( &startRe, pzScan, MAX_MATCH, aMatch, 0 );

  if ((rerr != 0) || (aMatch[0].rm_eo == -1)) {
    regerror( rerr, &startRe, zErrBuf, sizeof( zErrBuf ));
    fprintf( stderr, "Error %d (%s)\n\tThe start expression was not found\n",
             rerr, zErrBuf );
    exit( EXIT_FAILURE );
  }

  pzScan = pzScan + aMatch[0].rm_eo;

  /*
   *  Advance to the first newline following the start pattern
   *  (providing that the pattern does not end with a newline.)
   *  A match that ends at offset zero has no preceding character to
   *  examine, and the scan must stop at the end of the text when the
   *  match is on the last line and that line has no newline.
   */
  if ((aMatch[0].rm_eo == 0) || (pzScan[-1] != '\n'))
    while ((*pzScan != '\n') && (*pzScan != '\0')) pzScan++;
  return pzScan;
}


/*
 *  findTail
 *
 *  Separate trailing text that must not be sorted from the last block.
 *
 *  pzScan   NUL-terminated text of the last block
 *
 *  Returns the address of the line in which the TRAILER expression
 *  matched, after storing a NUL ahead of it to end the last block.
 *  Returns NULL when there is no TRAILER option, when the expression
 *  is not found, or when the match is so near the start that the
 *  last block would be left without text.  In those cases a final
 *  newline is removed from the block, because the output code
 *  appends one.  aMatch[] is overwritten when the option is present.
 */
char*
findTail( char* pzScan )
{
  char*  pzEnd;
  char*  pzTrailer;
  int    found = (HAVE_OPT( TRAILER ))
      && (regexec( &trailerRe, pzScan, MAX_MATCH, aMatch, 0 ) == 0);

  if (! found) {
    /*
     *  No trailer:  just drop the final newline of a non-empty block.
     */
    if (*pzScan != NUL) {
      pzEnd = pzScan + strlen( pzScan );
      if (pzEnd[-1] == '\n')
        pzEnd[-1] = NUL;
    }
    return (char*)NULL;
  }

  if (aMatch[0].rm_so == -1) {
    fputs( "trailer pattern had no offset\n", stderr );
    return (char*)NULL;
  }

  /*
   *  Back up from the match to the newline ahead of it, but not
   *  beyond the start of the block.
   */
  for (pzTrailer = pzScan + aMatch[0].rm_so;
       pzTrailer > pzScan;
       pzTrailer--) {
    if (*pzTrailer == '\n')
      break;
  }

  /*
   *  The last block has to keep *some* text.  If the trailer would
   *  swallow it, treat the whole thing as a block without a trailer.
   */
  if (pzTrailer <= pzScan+1) {
    pzEnd = pzScan + strlen( pzScan );
    if (pzEnd[-1] == '\n')
      pzEnd[-1] = NUL;
    return (char*)NULL;
  }

  /*
   *  Cut the block off at the newline; the trailer follows it.
   */
  *pzTrailer = NUL;
  pzTrailer++;

  if (HAVE_OPT( VERBOSE ))
    fprintf( stderr, "trailer pattern found at char %d\n",
             (u_int)(pzTrailer - pzText) );

  return pzTrailer;
}


/*
 *  loadFile
 *
 *  Read all of standard input into one allocated buffer.
 *
 *  Returns the buffer, with a NUL stored after the last byte read so
 *  that the text can be handled as a C string.  The buffer is never
 *  freed here; the caller owns it.  stdin is closed before returning.
 *
 *  A read error is reported on stderr and ends the program with
 *  EXIT_FAILURE.
 */
char*
loadFile( void )
{
  size_t   bufSize;
  size_t   spaceLeft;
  char*    pBuf;
  char*    pScn;

  /*
   *  For a regular file, size the buffer from the file size plus
   *  between 1K and 2K of slack.  Otherwise start with 4K and grow.
   */
  {
    struct stat stbuf;
    if (  (fstat( fileno( stdin ), &stbuf ) < 0)
       || (! S_ISREG( stbuf.st_mode ))  ) {
      spaceLeft = bufSize = 0x1000;
    } else {
      spaceLeft = bufSize = (stbuf.st_size + 0x800) & ~0x3FF;
    }
  }

  pScn = pBuf = (char*)xmalloc( bufSize );

  for (;;) {
    size_t readCt = fread( pScn, 1, spaceLeft, stdin );
    if (readCt == 0) {
      if (feof( stdin ))
        break;
      fprintf( stderr, "Error %d (%s) reading input\n",
               errno, strerror( errno ));
      exit( EXIT_FAILURE );
    }

    pScn      += readCt;
    spaceLeft -= readCt;

    /*
     *  Keep at least 1K free.  The buffer may move when it grows, so
     *  remember how much of it is filled and recompute the scan
     *  pointer from the new base address.
     */
    if (spaceLeft < 0x400) {
      size_t used = (size_t)(pScn - pBuf);
      bufSize   += 0x1000;
      spaceLeft += 0x1000;
      pBuf = (char*)xrealloc( (void*)pBuf, bufSize );
      if (pBuf == (char*)NULL) {
        fprintf( stderr, "ERROR:  cannot reallocate 0x%08lX bytes\n",
                 (unsigned long)bufSize );
        exit( EXIT_FAILURE );
      }
      pScn = pBuf + used;
    }
  }

  /*
   *  At least 1K is always left free at this point, so there is
   *  room for the terminator that all the string scanning relies on.
   */
  *pScn = '\0';

  fclose( stdin );
  return pBuf;
}

/*
 * Local Variables:
 * mode: C
 * c-file-style: "gnu"
 * tab-width: 8
 * indent-tabs-mode: nil
 * End:
 * end of blocksort.c */
