/*************************************************************
wos2bib reads a Web of Science savedrecs.txt file and converts it to 
  the BibTeX file format.  

Copyright (C) 2003  Ethan Gutmann

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**************************************************************/

#include "wos2bib.h"

// Read a field value that may continue over several input lines
// (Author, Title, Abstract, Keywords) into output.
//
//   unit    - input stream; the callers in this file invoke readMulti right
//             after consuming the two-letter field tag
//   output  - destination buffer.  It receives the text of the first line and,
//             for every continuation line, joiner followed by that line's text.
//             No bounds checking is done: the caller must supply a buffer that
//             is large enough for the joined result.
//   joiner  - string placed between consecutive lines (" and " for authors)
//   toLower - when 1, each line is passed through lowerAuthor() before joining
//
// A continuation line is recognised by its first character being a space.
// When the next line is not a continuation the stream is repositioned to the
// start of that line so the caller can read its tag.
void readMulti(FILE* unit, char* output, char* joiner, int toLower)
{
  char line[HUGE_STR], cur[HUGE_STR];
  fpos_t curPos;
  int next;   // int rather than char so the EOF value from getc() is kept intact

  line[0]='\0';
  cur[0]='\0';
  output[0]='\0';

  // nothing could be read: leave output empty
  if (fgets(line, HUGE_STR, unit) == NULL) return;
  stripnewline(line);
  // start at offset 1 -- presumably skips the blank that follows the tag
  substr(output, line, 1, strlen(line));

  if (toLower == 1) {lowerAuthor(output);}

  for (;;) {
    // remember where the next line starts, then peek at its first character
    fgetpos(unit, &curPos);
    next=getc(unit);
    if (next != ' ') break;

    // the input ended right after the blank: there is nothing to append
    // and nothing to rewind to
    if (fgets(line, HUGE_STR, unit) == NULL) return;
    stripnewline(line);
    // offset 2 -- presumably skips the rest of the continuation indent
    substr(cur, line, 2, strlen(line));

    if (toLower == 1) {lowerAuthor(cur);}

    // append in place; sprintf(output, "%s...", output, ...) would pass
    // overlapping source and destination, which is undefined behaviour
    strcat(output, joiner);
    strcat(output, cur);
  }
  // return the file pointer to where it was BEFORE we read the last character
  fsetpos(unit, &curPos);
}  

// Similar to stringToLower, but only acts on words longer than two letters.
//
// The string is split into words at ' ', ',' and '-'.  For every word of
// three or more characters the first character is left alone and the rest is
// converted to lower case ("SMITH" -> "Smith").  Words of one or two
// characters -- typically initials such as "JA" -- are left untouched.
// The conversion is done in place; an empty string is left as it is.
//
// Limitation: a run of three or more initials is indistinguishable from a
// short name and is therefore lowered as well ("JRR" -> "Jrr").
void lowerAuthor(char* s)
{
  size_t i = 0;

  while (s[i] != '\0') {
    size_t start, k;

    // step over the separators between words
    if (s[i] == ' ' || s[i] == ',' || s[i] == '-') {
      i++;
      continue;
    }

    // find the end of the word that starts here
    start = i;
    while (s[i] != '\0' && s[i] != ' ' && s[i] != ',' && s[i] != '-')
      i++;

    if (i - start > 2) {
      // cast through unsigned char: tolower() is undefined for negative values
      for (k = start + 1; k < i; k++)
        s[k] = (char) tolower((unsigned char) s[k]);
    }
  }
}

// added by Maarten Sneep
// Translate a string to lower case, in place. Is used to generate the key.
// The character that directly follows a '-' or a ' ' is skipped, i.e. it
// keeps its original case.
void stringToLower( char *s )
{
    size_t i, len;
    
    len = strlen(s);
    
    for( i = 0; i < len; i++ ){
      // cast through unsigned char: tolower() is undefined for negative values
      s[i] = (char) tolower((unsigned char) s[i]);
      if ((s[i]=='-') || (s[i] == ' ')) i++;  // skip letters following spaces and "-"
    }
}

// Only converts the string to lower case if there are no lowercase letters
// in it already; a string that contains any lowercase letter is assumed to
// be correctly capitalised and is left untouched.
// When converting, the character that directly follows a '-' or a ' ' is
// skipped, i.e. it keeps its original case.  Works in place.
void smartToLower(char *s)
{
    size_t i, len;
    
    len = strlen(s);

    // cast through unsigned char: the <ctype.h> functions are undefined for
    // negative values
    for( i = 0; i < len; i++ ){ if (islower((unsigned char) s[i])) return;}
    
    for( i = 0; i < len; i++ ){
      s[i] = (char) tolower((unsigned char) s[i]);
      if ((s[i]=='-') || (s[i] == ' ')) i++;  // skip letters following spaces and "-"
    }
}


// Skip the remainder of the current record: lines are read and discarded up
// to and including the first line that is exactly "ER\n", or until no further
// line can be read.
void gotoEndofRec(FILE* unit){
  char line[LARGE_STR];

  // drive the loop from the fgets() result so that a failed read ends the
  // scan instead of comparing a buffer that was never filled
  while (fgets(line, LARGE_STR, unit) != NULL) {
    if (strcmp(line, "ER\n") == 0)
      break;
  }
} 

// Read the next record from the input file into curRecord.
//
// Lines are skipped until one whose first two characters are "PT"
// (Publication Type).  Only journal records ("PT J") are parsed.  For any
// other type a notice is printed (unless the end of the file was reached),
// the rest of that record is skipped with gotoEndofRec() and curRecord is
// returned with all of its text fields empty.
//
// For a journal record each following line is dispatched on its field tag
// until the "ER" (End of Record) line is found:
//   AU authors (joined with " and ")   AB abstract
//   AR article number, used as the first page for PHYSICAL REVIEW journals
//   TI title          SO journal name
//   BP first page     EP last page
//   PY year           PD month (three characters are copied)
//   V* volume         ID / DE keywords     any other I* tag: issue
// Lines with any other tag are ignored.
//
// Afterwards the pages field is assembled from the first/last page, fields
// that are entirely upper case are lower-cased after their first character,
// and the key is built from the first word of the author list followed by
// the year.
//
// No bounds checking is done on any of the fixed-size buffers.
void GetNextRecord(FILE* unit, bibRec* curRecord)
{
  char line[LARGE_STR], curWork[LARGE_STR], 
    bpage[SHORT_STR], epage[SHORT_STR];
  int done;
  char type;

  // reset all fields of the record and the local work buffers
  (*curRecord).key[0]='\0';
  (*curRecord).pub[0]='\0';
  (*curRecord).year[0]='\0';
  (*curRecord).title[0]='\0';
  (*curRecord).author[0]='\0';
  (*curRecord).abstract[0]='\0';
  (*curRecord).keywords[0]='\0';
  (*curRecord).pages[0]='\0';
  (*curRecord).volume[0]='\0';
  (*curRecord).issue[0]='\0';
  // month is read further down and by writeRecord() even when the record
  // has no PD line, so it has to start out empty like the other fields
  (*curRecord).month[0]='\0';
  bpage[0]='\0';
  epage[0]='\0';
  line[0]='\0';
  curWork[0]='\0';

  done=FALSE;
  // read the first line should begin with "PT" for Publication Type
  fgets(line, LARGE_STR, unit);
  substr(curWork, line, 0, 2);
  curWork[2]='\0';
  while (!feof(unit) && strcmp(curWork, "PT")) 
    {
      curWork[0]='\0';
      fgets(line, LARGE_STR, unit);
      substr(curWork, line, 0, 2);
      curWork[2]='\0';
    }
  curWork[0]='\0';

  // the type letter is the character at offset 3 ("PT J"); only look at it
  // when the line is long enough to have one, otherwise curWork stays empty
  if (strlen(line) > 3) {
    substr(curWork, line, 3, 1);
    curWork[1]='\0';
  }

  // if this isn't a Journal entry we don't know how to deal with it
  //   (and we may not deal with all of those well)
  if (strcmp(curWork, "J"))
    {
      if (!feof(unit)){
        printf("I don\'t work with non-Journal entries yet\n   %s\n", 
               curWork);
      }
      gotoEndofRec(unit);
      return;
    }
  (*curRecord).pub_type = ARTICLE;

  // while we haven't reached an "ER" line and we haven't reached
  // the end of the file read a new line and parse it into the 
  // curRecord data structure.  
  while (!done && !feof(unit)) 
    {
      // first character of the tag; without one there is nothing to dispatch on
      if (fscanf(unit, "%c", &type) != 1)
        break;

      switch (type) {

      // read in the AUthor line(s), the ABstract or the ARticle number
      case 'A':
        fscanf(unit, "%c", &type);
        if (type=='U'){
          readMulti(unit, (*curRecord).author, " and ",1);
        }
        else if (type=='B'){
          readMulti(unit, (*curRecord).abstract, " ",0);
        }
        else if (type=='R') { // by Florian: the new AR field
          // i.e. beginning page number.
          // This is designed to work for the new style of
          // Physical Review journals, where the ARTICLE number
          // effectively replaces the page number (pages
          // are numbered AR-1, AR-2, etc.)
          // revised by Hermann Schier
          // NOTE: pub must already have been read (SO line) for this to match
          if (fgets(line, LARGE_STR, unit) == NULL)
            break;
          if (!strncmp((*curRecord).pub, "PHYSICAL REVIEW", 15)){
            stripnewline(line);
            substr(bpage, line, 1, strlen(line));
          }
        }
        // any other A* tag: its remaining text starts with a blank and is
        // discarded by the default case on the next pass
        break;

      // Read in the TItle line(s)
      case 'T':
        fscanf(unit, "%c", &type);
        if (type =='I'){
          readMulti(unit, (*curRecord).title, " ",0);
        }
        // Times cited (TC) ignored for now.
        break;

      // Read in the SOurce publication (journal name)
      case 'S':
        fscanf(unit, "%c", &type);
        if (type =='O'){  // avoids the occasional SU
          if (fgets(line, LARGE_STR, unit) == NULL)
            break;
          stripnewline(line);
          substr((*curRecord).pub, line, 1, strlen(line));
        }
        break;

      // Read in the Beginning Page number
      case 'B':
        if (fgets(line, LARGE_STR, unit) == NULL)
          break;
        stripnewline(line);
        // line holds the tag's second letter onwards; only BP is a page
        // number, any other B* line is dropped
        if (line[0] == 'P')
          substr(bpage, line, 2, strlen(line));
        break;

      // Read in the Ending Page number also checks to make sure 
      //   we didn't hit the End of the Record (ER)
      case 'E':
        fscanf(unit, "%c", &type);
        if (fgets(line, LARGE_STR, unit) == NULL) {
          // "ER" as the very last thing in the file still ends the record
          if (type == 'R') done=TRUE;
          break;
        }
        stripnewline(line);
        if (type == 'R') {           // ER=End Record, so we are done
          fgets(line, LARGE_STR, unit);  // read past the next (blank) line
          done=TRUE;
        }
        else if (type == 'P') {      // EP=End Page
          substr(epage, line, 1, strlen(line));
        }
        // any other E* line is dropped
        break;

      // read in the Publication Year if this is not PY then ignore it
      case 'P':
        fscanf(unit, "%c", &type);
        if (fgets(line, LARGE_STR, unit) == NULL)
          break;
        stripnewline(line);
        if (type == 'Y'){  //PY = Publication Year
          substr((*curRecord).year, line, 1, strlen(line));
        }
        else if (type =='D'){//PD = Publication Date
          substr((*curRecord).month, line, 1, 3);
        }
        break;

      // read in the volume number
      case 'V':
        if (fgets(line, LARGE_STR, unit) == NULL)
          break;
        stripnewline(line);
        substr((*curRecord).volume, line, 2, strlen(line));
        break;

      // read in the issue number or Keywords
      case 'I':
        curWork[0]='\0';
        line[0]='\0';
        fscanf(unit, "%c", &type);
        if (type == 'D'){  //Keywords
          readMulti(unit, curWork, " ",0);
          // put the ID keywords in front of whatever was collected so far;
          // only add the "; " separator when there is something to separate
          if ((*curRecord).keywords[0] != '\0')
            sprintf(line, "%s; %s", curWork, (*curRecord).keywords);
          else
            sprintf(line, "%s", curWork);
          strcpy((*curRecord).keywords, line);
          curWork[0]='\0';
          line[0]='\0';
          break;
        }
        // else it must be an issue number
        if (fgets(line, LARGE_STR, unit) == NULL)
          break;
        stripnewline(line);
        substr((*curRecord).issue, line, 1, strlen(line));
        break;

      // read in the DE keywords (replaces anything stored in keywords so far)
      case 'D':
        fscanf(unit, "%c", &type);
        if (type =='E'){
          readMulti(unit, (*curRecord).keywords, " ",0);
        }
        break;

      // anything else (including continuation lines of ignored fields)
      default :
        fgets(line, LARGE_STR, unit);
      }
    }

  /* Maarten Sneep:
   * if the begin and end page are not empty: make a reference with both begin and end,
   * otherwise use just the begin page.
   */
  if ( strlen(bpage) > 0 && strlen(epage) > 0 )
    sprintf((*curRecord).pages, "%s--%s", bpage, epage) ;
  else if( strlen(bpage) > 0 )
    sprintf((*curRecord).pages, "%s", bpage ) ;

  // first word of the author list, used as the stem of the key; start from
  // an empty buffer so leftovers of the last input line never end up in it
  line[0]='\0';
  if (sscanf((*curRecord).author, "%s", line) == 1) {
    stripnewline(line);  // actually using this to strip off the comma
    stringToLower(line); // make name lowercase, before creating the key.
  }

  // lower-case everything after the first character of all-uppercase fields;
  // an empty field has nothing after its terminator that may be touched
  if (curRecord->keywords[0] != '\0')
    smartToLower(&((curRecord->keywords)[1]));
  if (curRecord->pub[0] != '\0')
    smartToLower(&((curRecord->pub)[1]));
  // the author is handled within readMulti (lowerAuthor)
  if (curRecord->title[0] != '\0')
    smartToLower(&((curRecord->title)[1]));
  if (curRecord->month[0] != '\0')
    smartToLower(&((curRecord->month)[1]));

  // build the key
  sprintf((*curRecord).key, "%s%s", line, (*curRecord).year);
}

// initializes the file pointer for the input file
// reads the first two lines and verifies that the first line
//   is "FN ISI Export Format"
// Both lines are echoed to stdout.  When the first line is missing or
// different an error message is printed and the second line is not read;
// nothing is reported back to the caller.
void initRead(FILE* unit)
{
  char line[LARGE_STR];

  // read the first line; if that fails continue with an empty line so that
  // the check below reports the problem instead of using an unset buffer
  if (fgets(line, LARGE_STR, unit) == NULL)
    line[0]='\0';

  // check to make sure the first line is what is should be
  printf("%s",line);
  if (strcmp(line, "FN ISI Export Format\n"))
    {
      printf("ERROR reading input file\n     %s\n", line);
      return;
    }
  
  // read the second line, should be "VR 1.0" 
  if (fgets(line, LARGE_STR, unit) == NULL)
    line[0]='\0';
  printf("%s\n",line);
}


// Writes one record to unit as a BibTeX @article entry.
// A record whose author field is empty is skipped entirely.
void writeRecord(FILE* unit, bibRec* curRecord)
{
  if (curRecord->author[0] == '\0')
    return;

  /* Maarten Sneep:
   * a field is only printed when it isn't empty.
   * To prevent spurious commas in the bib file, the comma is printed by the next entry 
   * so that at any time the item is ready to be closed.
   */

  // entry header with the key
  fprintf(unit, "@article { %s", curRecord->key);

  if (curRecord->pub[0] != '\0')
    fprintf(unit, ",\n     Journal = {%s}", curRecord->pub);

  if (curRecord->year[0] != '\0')
    fprintf(unit, ",\n     Year = {%s}", curRecord->year);

  if (curRecord->title[0] != '\0')
    fprintf(unit, ",\n     Title = {%s}", curRecord->title);

  if (curRecord->author[0] != '\0')
    fprintf(unit, ",\n     Author = {%s}", curRecord->author);

  if (curRecord->keywords[0] != '\0')
    fprintf(unit, ",\n     Keywords = {%s}", curRecord->keywords);

  if (curRecord->abstract[0] != '\0')
    fprintf(unit, ",\n     Abstract = {%s}", curRecord->abstract);

  if (curRecord->pages[0] != '\0')
    fprintf(unit, ",\n     Pages = {%s}", curRecord->pages);

  if (curRecord->volume[0] != '\0')
    fprintf(unit, ",\n     Volume = {%s}", curRecord->volume);

  if (curRecord->issue[0] != '\0')
    fprintf(unit, ",\n     Issue = {%s}", curRecord->issue);

  if (curRecord->month[0] != '\0')
    fprintf(unit, ",\n     Month = {%s}", curRecord->month);

  // close the entry and leave a blank line after it
  fprintf(unit, "\n}\n\n");
}    


// Prints the usage text to stdout.  Called when the user did not pass
// any command line arguments to wos2bib.
void usage( void )
{
  static const char *const text[] = {
    "\n",
    "------------wos2bib version 0.5.4-------------\n",
    "\n",
    " use : wos2bib <inputfile> [outputfile]\n",
    "\n",
    "   If the output file is not specified then \n",
    "      output file will be <inputfile>.bib\n",
    "\n",
    "   Data will be appended to the end of the output\n",
    "      file if it already exists.  \n",
    "\n"
  };
  size_t n;

  for (n = 0; n < sizeof text / sizeof text[0]; n++)
    fputs(text[n], stdout);
}


// Makes the key of curRecord unique among the records that follow it in the
// list.  Every later record carrying the same key gets a one-letter suffix
// ('b' for the first duplicate, then 'c', ...); when at least one duplicate
// was found the key of curRecord itself gets the suffix 'a'.
// Earlier records in the list are not examined.
void fixKeys(bibRec *curRecord)
{
  char *baseKey = curRecord->key;
  int baseLen = strlen(baseKey);
  char suffix = 'b';
  bibRec *other;

  for (other = (bibRec*) curRecord->nextRec;
       other != NULL;
       other = (bibRec*) other->nextRec) {
    if (strcmp(other->key, baseKey) != 0)
      continue;

    // same key: overwrite the terminator with the next free letter
    other->key[baseLen] = suffix;
    other->key[baseLen+1] = '\0';
    suffix++;
  }

  // no duplicate was renamed, so the original key can stay as it is
  if (suffix == 'b')
    return;

  baseKey[baseLen] = 'a';
  baseKey[baseLen+1] = '\0';
}


// main program
int main(int argc, char* argv[])
{
  char infile[LARGE_STR];
  char outfile[LARGE_STR];
  FILE *un;
  FILE *oun;
  bibRec *curRecord;
  bibRec *tmpRecord;
  bibRec *firstRecord;

  // check input information
  if ((argc < 2)){// || (!ftest(argv[1]))) {
    usage();
    return(2);
  }

  // setup file names
  strcpy(infile, argv[1]);  // input filename

  if (argc == 3) {             //IF we were given an output filename
    strcpy(outfile, argv[2]);  // then save that name
  }
  else                         //ELSE create an output filename by appending .bib
    sprintf(outfile, "%s.bib", infile);  // to the input filename

  // open input and output files
  un=fopen(infile, "r");      //open for reading
  oun=fopen(outfile, "a");    //open for writing, append to end of file

  //read past the header and make sure the format appears correct
  initRead(un);
  
  firstRecord=(bibRec*)malloc(sizeof(struct bibTeX_Rec));
  tmpRecord=firstRecord;
  curRecord=firstRecord;

  // loop through the input file reading a new record
  while (!feof(un)) {
    tmpRecord=curRecord;
    GetNextRecord(un, curRecord);             //reads an input file 
    curRecord=(bibRec*)malloc(sizeof(struct bibTeX_Rec));
    tmpRecord->nextRec=curRecord;
  }
  free(curRecord);
  tmpRecord->nextRec=NULL;

  // Loop through the records we just read writing them to disk in BibTeX format
  //   and freeing up the memory.  
  curRecord=firstRecord;
  while (curRecord->nextRec != NULL) {
    fixKeys(curRecord);
    printf("%s\n", curRecord->key);
    tmpRecord=(bibRec*) curRecord->nextRec;
    writeRecord(oun, curRecord); //writes to the output file
    free(curRecord);
    curRecord=tmpRecord;
  }
 
  fclose(un);
  fclose(oun);
  return(0);
}

