/*
This product contains certain software code or other information
("AT&T Software") proprietary to AT&T Corp. ("AT&T").  The AT&T
Software is provided to you "AS IS".  YOU ASSUME TOTAL RESPONSIBILITY
AND RISK FOR USE OF THE AT&T SOFTWARE.  AT&T DOES NOT MAKE, AND
EXPRESSLY DISCLAIMS, ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND
WHATSOEVER, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WARRANTIES OF
TITLE OR NON-INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS, ANY
WARRANTIES ARISING BY USAGE OF TRADE, COURSE OF DEALING OR COURSE OF
PERFORMANCE, OR ANY WARRANTY THAT THE AT&T SOFTWARE IS "ERROR FREE" OR
WILL MEET YOUR REQUIREMENTS.

Unless you accept a license to use the AT&T Software, you shall not
reverse compile, disassemble or otherwise reverse engineer this
product to ascertain the source code for any AT&T Software.

(c) AT&T Corp. All rights reserved.  AT&T is a registered trademark of AT&T Corp.

***********************************************************************

History:

      24/11/99  - initial release by Hartmut Liefke, liefke@seas.upenn.edu
                                     Dan Suciu,      suciu@research.att.com
*/

//**************************************************************************
//**************************************************************************

// This module contains the Enumeration-Compressor 'e'
// The enumeration-compressor (or dictionary compressor) assigns a
// non-negative integer (starting at 0) to each distinct string that occurs.
// In the compressor, a dictionary of all previously seen strings is
// maintained and a hash table allows efficient lookup. A new
// string is assigned the next free number.

// The decompressor loads the dictionary and directly accesses the
// dictionary for a given integer.

// Note that there is *only one* hash table for all enum-compressors !!
// I.e. the strings of all enum-compressors are stored in the same hash table;
// entries are told apart by the compressor state they belong to.

#include "stdafx.h"

#ifdef NOTHREAD
void *EnumHashEntry::operator new(size_t size)
   // Class-specific allocation (only compiled when NOTHREAD is defined):
   // the storage for a hash entry is taken from 'enumcompressmem'
   // instead of the default operator new.
{
   void *block=enumcompressmem->GetByteBlock(size);
   return block;
}
#endif

char *EnumHashEntry::GetStrPtr() 
{  
	return dataptr;   
}

unsigned EnumHashTable::CalcHashIdx(char *str,int len)
   // Computes the hash index for the 'len' bytes starting at 'str'.
   // The running value is seeded with the length, mixed once per input byte,
   // and finally folded with ENUMHASHTABLE_SHIFT and masked with
   // ENUMHASHTABLE_MASK.
{
   unsigned             hash=len;
   const unsigned char  *cur=(const unsigned char *)str;

   for(;len!=0;len--)
   {
      // Mixing step; the last term shifts by 0 or 1 depending on the lowest bit
      hash=(hash<<3)+(hash>>29)+(hash>>(hash&1));

      // Add the next byte (as an unsigned value) and advance
      hash+=*cur++;
   }
   return (hash+(hash>>ENUMHASHTABLE_SHIFT))&ENUMHASHTABLE_MASK;
}

EnumHashTable::EnumHashTable()
   // A new table starts in the 'not initialized' state (same effect as Reset());
   // the slots themselves are cleared by the first call of Initialize()
{
   isinitialized=FALSE;
}

void EnumHashTable::Initialize()
   // The hash table is emptied
{
   if(!isinitialized)
   {
      for(int i=0;i<ENUMHASHTABLE_SIZE;i++)
         hashtable[i]=NULL;

      isinitialized=TRUE;
   }
}

void EnumHashTable::Reset()
   // This will cause the hash table to be emptied next time we try to
   // add elements
{
   isinitialized=FALSE;
}

EnumHashEntry *EnumHashTable::FindOrCreateEntry(char *str,int len,EnumCompressState *enumstate,char *isnew,MemStreamer *strmem)
   // Looks up the string (str,len) for compressor state 'enumstate' and
   // creates a new hash entry if there is none yet.
   //   str, len   the string bytes and their number
   //   enumstate  the state of the enum-compressor; entries of other states never match
   //   isnew      output: set to 1 if an entry has been created, to 0 otherwise
   //   strmem     the MemStreamer that receives the tuple (len,str) of a new entry
   // Returns the found or the newly created entry.
{
   // 'link' walks over the 'next' references of the collision chain,
   // starting with the hash table slot itself
   EnumHashEntry **link=hashtable+CalcHashIdx(str,len);

   for(;*link!=NULL;link=&((*link)->nextsamehash))
   {
      EnumHashEntry *candidate=*link;

      // Different length or different compressor state ? ==> No match
      if((candidate->datalen!=len)||(candidate->enumstate!=enumstate))
         continue;

      // Same bytes ? ==> We found the entry and are done
      if(mymemcmp(str,candidate->GetStrPtr(),len)==0)
      {
         *isnew=0;
         return candidate;
      }
   }

   // We reached the end of the chain without a match
   // ==> A new entry is appended at the end of the chain
   EnumHashEntry *created=new EnumHashEntry();

   *link=created;
   created->nextsamehash=NULL;
   created->enumstate=enumstate;

   // 'strmem' receives the length first and then the bytes of the string.
   // Hence, it contains a sequence of tuples (len,str) from which the
   // decompressor can reconstruct the dictionary
   strmem->StoreUInt32(len);

   created->datalen=len;
   created->dataptr=strmem->GetByteBlock(len);
   memcpy(created->dataptr,str,len);

   *isnew=1;
   return created;
}

EnumerationCompressor::EnumerationCompressor()
   : UserCompressor()
   // Default construction: the base class is default-constructed and
   // the compressor properties are set by Init()
{
   Init();
}

EnumerationCompressor::EnumerationCompressor(Session *s)
   : UserCompressor(s)
   // Construction for a given session: 's' is passed on to the base class,
   // the compressor properties are set by Init()
{
   Init();
}

void EnumerationCompressor::Init()
   // Sets the properties of the enum-compressor; shared by both constructors
{
   // Size of the per-instance state
   datasize    =sizeof(EnumCompressState);

   // One container is used
   contnum     =1;

   // Same values as reported by the factory (IsRejecting()/CanOverlap())
   isrejecting =0;
   canoverlap  =1;

   isfixedlen  =0;
}

void EnumerationCompressor::InitCompress(CompressContainer *cont,char *dataptr)
   // Initializes the enum state that is located at 'dataptr'
   // ('cont' is not used here)
{
   EnumCompressState *state=(EnumCompressState *)dataptr;

   // The first string will get index 0
   state->curidx=0;

   // The MemStreamer lives in raw state memory, so it is initialized
   // explicitly - this is the same as calling the constructor
   state->stringmem.Initialize(session, 0);

   // The state is registered in the list of states so that its
   // dictionary can be stored as soon as the data is about to be compressed
   session->AddEnumCompressState(state);

   // The shared hash table is emptied if that has not happened yet
   session->enumhashtable->Initialize();
}

void EnumerationCompressor::CompressString(char *str,unsigned len,CompressContainer *cont,char *dataptr)
   // Compresses a specific string item
{
   char isnew;

   // Let's lookup the item in the hash table
   EnumHashEntry *entry=session->enumhashtable->FindOrCreateEntry(
      str,len,(EnumCompressState *)dataptr,
      &isnew,&(((EnumCompressState *)dataptr)->stringmem));
   
   if(isnew)   // Is this a new entry? ==> We need to assign a new index
   {
      entry->localidx=((EnumCompressState *)dataptr)->curidx;
      ((EnumCompressState *)dataptr)->curidx++;
   }
   // If the item is not new, then EnumCompressState::curidx is the
   // index of the corresponding string

   // We store the index of the item in the container
   cont->StoreCompressedUInt(entry->localidx);
}

void EnumerationCompressor::PrintCompressInfo(char *dataptr,unsigned long *overalluncomprsize,unsigned long *overallcomprsize)
   // Prints statistical information about how well the compressor compressed
   // the data
{
   unsigned long  uncompsize=((EnumCompressState *)dataptr)->uncompressed_size,
                  compsize=((EnumCompressState *)dataptr)->compressed_size;

   *overalluncomprsize+=uncompsize;
   *overallcomprsize+=compsize;

	if (session->settings->verbose >= XMILL_VERBOSE_ALL) {
		if(compsize!=0)
			printf("       Enum: %8lu ==> %8lu (%f%%)\n",
				uncompsize,compsize,
				100.0f*(float)compsize/(float)uncompsize);
		else
			printf("       Enum: %8lu ==> Small...\n",uncompsize);
	}
}

EnumerationUncompressor::EnumerationUncompressor()
   : UserUncompressor()
   // Default construction: the base class is default-constructed and
   // the decompressor properties are set by Init()
{
   Init();
}

EnumerationUncompressor::EnumerationUncompressor(Session *s)
   : UserUncompressor(s)
   // Construction for a given session: 's' is passed on to the base class,
   // the decompressor properties are set by Init()
{
   Init();
}

void EnumerationUncompressor::Init()
   // Sets the properties of the enum-decompressor; shared by both constructors
{
   // Size of the per-instance state
   datasize =sizeof(EnumUncompressState);

   // One container is used
   contnum  =1;
}

void EnumerationUncompressor::InitUncompress(UncompressContainer *cont,char *dataptr)
   // Initializes the decompressor state at 'dataptr' with a copy of the
   // next state from the sequence of loaded states ('cont' is not used here)
{
   EnumUncompressState *loaded=GetNextPossibleEnumUnCompressState();

   *(EnumUncompressState *)dataptr=*loaded;
}

EnumUncompressState *EnumerationUncompressor::GetNextPossibleEnumUnCompressState()
   // Forwards the request to the decompressor factory of the session
{
   return session->enumdcompressfactory->GetNextPossibleEnumUnCompressState();
}

void EnumerationUncompressor::UncompressItem(UncompressContainer *cont,char *dataptr,XMLOutput *output)
   // An item is decompressed by looking up the dictionary:
   // the index is loaded from container 'cont', and the bytes of the
   // corresponding dictionary item are written to 'output'.
   // 'dataptr' points to the EnumUncompressState of this instance.
   // An index outside of the dictionary is reported as a corrupt file.
{
   EnumUncompressState  *state=(EnumUncompressState *)dataptr;
   unsigned             idx=cont->LoadUInt32();

   // The index comes from the compressed file and must not be trusted:
   // 'itemarray' only has 'itemnum' elements
   if(idx>=state->itemnum)
   {
      ExitCorruptFile();
      return;  // Only reached if ExitCorruptFile() returns
   }

   EnumDictItem *item=state->itemarray+idx;

   output->characters((char *)item->dataptr,item->len);
}

EnumerationCompressorFactory::EnumerationCompressorFactory(Session *s, int comp)
   : UserCompressorFactory(s, comp)
   // 's' and 'comp' are passed on to the base class;
   // there are no enum-compressor instances at the beginning
{
   enuminstancecount=0;
}

char *EnumerationCompressorFactory::GetName()
   // The name of the compressor is "e"
{
   return "e";
}
char *EnumerationCompressorFactory::GetDescription()
   // A one-line, human-readable description of the compressor
{
   return "Compressor for small number of distinct data values";
}
char EnumerationCompressorFactory::IsRejecting()
   // Always FALSE for the enum-compressor
{
   return FALSE;
}
char EnumerationCompressorFactory::CanOverlap()
   // Always TRUE for the enum-compressor
{
   return TRUE;
}

CompEnumerationCompressorFactory::CompEnumerationCompressorFactory(Session *s)
   : EnumerationCompressorFactory(s, XMILL_COMP)
   // Creates the factory for the compression side
{
   // The one compressor instance we hand out gets the session
   enumcompress.SetSession(session);

   // The list of enum states is empty;
   // 'lastenumstateref' always refers to the place where the next state is linked in
   enumstatelist=NULL;
   lastenumstateref=&enumstatelist;
}

void CompEnumerationCompressorFactory::AddEnumCompressState(EnumCompressState *state)
   // Appends 'state' at the end of the list of enum states
   // and counts it as a further enum-compressor instance
{
   // Link the state in at the current end of the list
   *lastenumstateref=state;

   // 'state' is the last element now; the next state will be
   // linked in through its 'next' reference
   state->next=NULL;
   lastenumstateref=&(state->next);

   ++enuminstancecount;
}

UserCompressor *CompEnumerationCompressorFactory::InstantiateCompressor(char *paramstr,int len)
   // The instantiation simply returns the one instance we have.
   // The enum-compressor does not take arguments: if 'paramstr' is not NULL,
   // an XMillException (XMILL_ERR_ARGUMENTS) is thrown as a pointer;
   // its message quotes the 'len' characters of 'paramstr'.
{
   // No arguments ? ==> That is the regular case
   if(paramstr==NULL)
      return &enumcompress;

   XMillException *e = new XMillException(XMILL_ERR_ARGUMENTS, "Enumeration compressor 'e' should not have any arguments ('");
   e->ErrorCont(paramstr,len);
   e->ErrorCont("')");
   throw e;

   /* dummy return */
   return NULL;
}

// The compression/decompression routines for the factory
// CompressFactories are also allowed to store status information
// in the compressed file ! The following procedure is used for
// compressing/decompressing this information:
// Small data (<1024Bytes) is stored in the header, while
// large data is stored in separate zlib-blocks in the output file
void CompEnumerationCompressorFactory::CompressSmallGlobalData(Compressor *compressor)
   // Compresses the small data: a directory of all enum states followed by
   // all dictionaries that are smaller than 'SMALLCOMPRESS_THRESHOLD'
{
   MemStreamer       headmem(session);
   EnumCompressState *state;

   // The directory starts with the number of enum compressors we have ...
   headmem.StoreUInt32(enuminstancecount);

   // ... followed by the number of dictionary entries and
   // the size of the string memory of each state
   for(state=enumstatelist;state!=NULL;state=state->next)
   {
      headmem.StoreUInt32(state->curidx);
      headmem.StoreUInt32(state->stringmem.GetSize());
   }

   compressor->CompressMemStream(&headmem);

   // Second pass over the states: the small dictionaries are stored;
   // the others are skipped here
   for(state=enumstatelist;state!=NULL;state=state->next)
   {
      if(state->stringmem.GetSize()>=SMALLCOMPRESS_THRESHOLD)
         continue;

      compressor->CompressMemStream(&(state->stringmem));
   }
}

void CompEnumerationCompressorFactory::CompressLargeGlobalData(Output *output)
   // Compresses the large dictionaries (size >= SMALLCOMPRESS_THRESHOLD) to 'output'.
   // Furthermore, the memory of all (also the small) dictionaries is released
   // and the factory is brought back into its empty start state
{
   Compressor        compressor(session->settings, output);
   unsigned long     uncompressedsize;

   for(EnumCompressState *state=enumstatelist;state!=NULL;state=state->next)
   {
      // The uncompressed size is kept for the statistical output at the
      // end of the compression
      state->uncompressed_size=state->stringmem.GetSize();

      if(state->stringmem.GetSize()<SMALLCOMPRESS_THRESHOLD)
         // Small dictionaries are not compressed here
         state->compressed_size=0;
      else
      {
         compressor.CompressMemStream(&state->stringmem);

         // The compressed size ends up in 'state->compressed_size'
         compressor.FinishCompress(&uncompressedsize,&(state->compressed_size));
      }

      // The memory of the dictionary is released
      state->stringmem.ReleaseMemory(0);
   }

   // The hash table is reset.
   // Note that the hash entries themselves are deleted separately by releasing
   // the memory of 'blockmem'
   session->enumhashtable->Reset();

   // The list of states is empty again
   enumstatelist=NULL;
   lastenumstateref=&enumstatelist;
   enuminstancecount=0;
}

unsigned long CompEnumerationCompressorFactory::GetGlobalDataSize()
   // Determines how much memory is needed for all dictionaries.
   // This information is later used in the decompression to allocate
   // the appropriate amount of memory
{
   unsigned long total=0;

   for(EnumCompressState *state=enumstatelist;state!=NULL;state=state->next)
   {
      // Per state: the dictionary directory (one EnumDictItem per entry)
      // plus the word-aligned string space
      total+=sizeof(EnumDictItem)*state->curidx+
               WordAlignUInt(state->stringmem.GetSize());
   }
   return total;
}

DecompEnumerationCompressorFactory::DecompEnumerationCompressorFactory(Session *s)
   : EnumerationCompressorFactory(s, XMILL_DECOMP)
   // Creates the factory for the decompression side
{
   // The one decompressor instance we hand out gets the session
   enumuncompress.SetSession(s);

   // No state has been handed out yet
   activeenumuncompressstates=0;
}

UserUncompressor *DecompEnumerationCompressorFactory::InstantiateUncompressor(char *paramstr,int len)
   // The instantiation simply returns the one instance we have;
   // 'paramstr' and 'len' are not looked at
{
   UserUncompressor *instance=&enumuncompress;
   return instance;
}

void DecompEnumerationCompressorFactory::UncompressSmallGlobalData(SmallBlockUncompressor *uncompressor)
   // Loads the directory of enum states (number of items and size of the
   // string space per state) and rebuilds all dictionaries that are smaller
   // than SMALLCOMPRESS_THRESHOLD.
   // All sizes and lengths come from the compressed file. They are checked
   // against the string buffer while the lookup array is built, and a
   // mismatch is reported through ExitCorruptFile().
{
   MemStreamer       headmem;   // NOTE(review): not used in this function; kept because the
                                // side effects of its constructor/destructor are not visible here
   unsigned long     i,j;
   unsigned char     *srcptr,*ptr,*endptr;
   EnumDictItem      *curitem;

   // First, let's extract the number of enum states
   enuminstancecount=uncompressor->LoadUInt32();

   // We allocate the space for the enum states
   enumuncompressstates=(EnumUncompressState *)enumcompressmem->GetByteBlock(sizeof(EnumUncompressState)*enuminstancecount);

   // For each state, we load the number of items and the size of the
   // string space
   for(i=0;i<enuminstancecount;i++)
   {
      enumuncompressstates[i].itemnum=uncompressor->LoadUInt32();
      enumuncompressstates[i].size=uncompressor->LoadUInt32();
   }

   // We align the main memory block of the decompressor
   session->WordAlignMemBlock();

   for(i=0;i<enuminstancecount;i++)
   {
      // Let's firstly load the small dictionaries
      if(enumuncompressstates[i].size<SMALLCOMPRESS_THRESHOLD)
      {
         // Every dictionary entry starts with a stored length, so a string
         // space of 'size' bytes cannot hold more than 'size' entries.
         // This rejects absurd item counts before anything is allocated for them
         if(enumuncompressstates[i].itemnum>enumuncompressstates[i].size)
            ExitCorruptFile();

         // Load the data. Afterwards, 'srcptr' points to the corresponding memory
         srcptr=uncompressor->LoadData(enumuncompressstates[i].size);

         // Let's allocate the memory
         enumuncompressstates[i].strbuf=session->AllocateMemBlock(enumuncompressstates[i].size);
         session->WordAlignMemBlock();
         memcpy(enumuncompressstates[i].strbuf,srcptr,enumuncompressstates[i].size);

         ptr=enumuncompressstates[i].strbuf;
         endptr=ptr+enumuncompressstates[i].size;

         // Let's now create the lookup array
         enumuncompressstates[i].itemarray=(EnumDictItem *)session->AllocateMemBlock(sizeof(EnumDictItem)*enumuncompressstates[i].itemnum);

         curitem=enumuncompressstates[i].itemarray;

         // We initialize the lookup array with pointers to the actual strings
         for(j=0;j<enumuncompressstates[i].itemnum;j++)
         {
            // There must be at least one byte left for the length
            if(ptr>=endptr)
               ExitCorruptFile();

            // Let's read the length first
            curitem->len=Load::UInt32(ptr);

            // The string must lie completely within the string buffer.
            // Without this check, a corrupt length would move 'ptr' to an
            // arbitrary address that is read in the next iteration
            if((ptr>endptr)||
               ((unsigned long)curitem->len>(unsigned long)(endptr-ptr)))
               ExitCorruptFile();

            // Let's read the data
            curitem->dataptr=Load::Data(ptr,curitem->len);
            // 'ptr' is moved forward by these operations

            curitem++;
         }
         // The pointer does not match the predicted size?
         // ==> We have a problem.
         if(ptr!=endptr)
            ExitCorruptFile();
      }
   }
   // The number of initialized enum states is zero at the beginning.
   // The number increases with each call of 'GetNextPossibleEnumUnCompressState()'
   activeenumuncompressstates=0;
}

void DecompEnumerationCompressorFactory::UncompressLargeGlobalData(Input *input)
   // Uncompresses the large dictionaries (size >= SMALLCOMPRESS_THRESHOLD)
   // from 'input' and builds their lookup arrays.
   // All sizes and lengths come from the compressed file. They are checked
   // against the string buffer while the lookup array is built, and a
   // mismatch is reported through ExitCorruptFile().
{
   unsigned long  i,j,tmpsize;
   unsigned char  *ptr,*endptr;
   EnumDictItem   *curitem;

   Uncompressor   uncompressor(session);

   session->WordAlignMemBlock();

   for(i=0;i<enuminstancecount;i++)
   {
      if(enumuncompressstates[i].size>=SMALLCOMPRESS_THRESHOLD)
      {
         // Every dictionary entry starts with a stored length, so a string
         // space of 'size' bytes cannot hold more than 'size' entries.
         // This rejects absurd item counts before anything is allocated for them
         if(enumuncompressstates[i].itemnum>enumuncompressstates[i].size)
            ExitCorruptFile();

         // Let's allocate the memory for the large block
         enumuncompressstates[i].strbuf=session->AllocateMemBlock(enumuncompressstates[i].size);
         session->WordAlignMemBlock();

         tmpsize=enumuncompressstates[i].size;

         // Let's do the actual uncompression
         if(uncompressor.Uncompress(input,enumuncompressstates[i].strbuf,&tmpsize))
            ExitCorruptFile();

         // Did we uncompress less data than expected? ==> Error
         if(tmpsize!=enumuncompressstates[i].size)
            ExitCorruptFile();

         ptr=enumuncompressstates[i].strbuf;
         endptr=ptr+enumuncompressstates[i].size;

         // Let's now create the lookup array
         enumuncompressstates[i].itemarray=(EnumDictItem *)session->AllocateMemBlock(sizeof(EnumDictItem)*enumuncompressstates[i].itemnum);

         curitem=enumuncompressstates[i].itemarray;

         // We initialize the lookup array with pointers to the actual strings
         for(j=0;j<enumuncompressstates[i].itemnum;j++)
         {
            // There must be at least one byte left for the length
            if(ptr>=endptr)
               ExitCorruptFile();

            // Let's read the length first
            curitem->len=Load::UInt32(ptr);

            // The string must lie completely within the string buffer.
            // Without this check, a corrupt length would move 'ptr' to an
            // arbitrary address that is read in the next iteration
            if((ptr>endptr)||
               ((unsigned long)curitem->len>(unsigned long)(endptr-ptr)))
               ExitCorruptFile();

            // Let's read the data
            curitem->dataptr=Load::Data(ptr,curitem->len);

            // 'ptr' is moved forward by these operations
            curitem++;
         }
         // The pointer does not match the predicted size?
         // ==> We have a problem.
         if(ptr!=endptr)
            ExitCorruptFile();
      }
   }
}

EnumUncompressState *DecompEnumerationCompressorFactory::GetNextPossibleEnumUnCompressState()
   // Retrieves the next state from the sequence of loaded states:
   // each call returns the state after the one returned by the previous call.
   // There is no check against the number of loaded states.
{
   EnumUncompressState *state=enumuncompressstates+activeenumuncompressstates;

   activeenumuncompressstates++;
   return state;
}

void DecompEnumerationCompressorFactory::FinishUncompress()
   // Releases the memory after decompression
{
   for(unsigned long i=0;i<enuminstancecount;i++)
   {
      session->FreeMemBlock(enumuncompressstates[i].strbuf,enumuncompressstates[i].size);
      session->FreeMemBlock(enumuncompressstates[i].itemarray,sizeof(EnumDictItem)*enumuncompressstates[i].itemnum);
   }
}
