/* $Id: cctspace.c,v 1.3 2004/11/26 11:41:32 zlb Exp $ */

/* This program is an auxiliary tool. It inserts '~' or ' ' between
 * adjacent GBK and ASCII characters. For example, "<GBK>ASCII<GBK>"
 * is transformed into "<GBK>~ASCII <GBK>", where <GBK> stands for a
 * run of GBK (Chinese) characters. The purpose of the program is to
 * help users convert their old .ctx files.
 */
#define VERSION "0.6.0.3"

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#if defined(WIN32) && defined(_DEBUG)
#	define DEBUG
#endif

#ifdef MIKTEX
   /* FIXME: not tested */
#  include "miktex.h"
#  include "gnu-miktex.h"
#  include "web2c-miktex.h"
#elif defined(USE_KPSE)
#  include <kpathsea/c-auto.h>
#  include <kpathsea/kpathsea.h>
#else
#  include "fullpath.h"
#endif

/* Nonzero when the `-t' option was given: main() then emits '~' instead
 * of a space after an ASCII char that is followed by a GBK char. */
static int force_tilde = 0;
/* Nonzero when the `-v' option was given: progress messages go to stderr. */
static int verbose = 0;

/* Input and output streams; both are opened in main(). */
static FILE *fin, *fout;
/* Write one character to the output stream. */
#define PUTCHAR(c) fputc(c, fout)

/* The text is processed through a sliding window held in RealBuffer.
 * One extra byte is reserved at each end of the array: main() stores
 * '\0' in RealBuffer[0] and in the byte at `end' after each read, so the
 * scans in MatchParenthesis (which stop at '\0') cannot run off either
 * side.  TRUNC is both the refill threshold and the distance by which
 * main() shifts the window contents. */
#define TRUNC 16384
static unsigned char RealBuffer[2 + (3*TRUNC < 2048 ? 2048 : 3*TRUNC)];
/* Number of data bytes available, i.e. the array size minus the two
 * sentinel bytes. */
#define BUFFERLEN (sizeof(RealBuffer) -2)
/* `buffer' is the first data byte, `start' the next byte to process and
 * `end' one past the last byte read so far. */
static unsigned char *start, *end, *buffer = RealBuffer + 1;

/* Token tables read from the configuration file.
 *
 * No space (or '~') is added at the right of text matching a token in
 * the `Left' list, or at the left of text matching a token in the
 * `Right' list (see BSearch and its callers in main).
 */
typedef struct {
    unsigned char	*string;	/* token text; stored reversed in `Left' */
    int			flag;		/* 0: normal, 1: transparent */
} token_t;

/* A growable array of tokens.  After init() has sorted it by length and
 * InitIndex() has run, tokens of length n occupy the half-open range
 * [index[n-1], index[n]) of `tokens'. */
typedef struct {
    token_t		*tokens;	/* the token array */
    size_t		size;		/* number of tokens in use */
    size_t		limit;		/* number of tokens allocated */
    size_t		*index;		/* per-length offsets, MaxLen+1 entries */
} tokenlist_t;
/* Initializer for an empty token list. */
#define TOK0 {NULL, 0, 0, NULL}

/* One list per section kind of the configuration file (filled by init). */
static tokenlist_t Left = TOK0, Right = TOK0, Verbatim = TOK0,
		Verbatim_env = TOK0, Verb = TOK0;

/* Length of the longest token added to any list (updated by AddToken). */
static unsigned int MaxLen = 0;

/* Nonzero when the byte value `c' lies strictly between 0x80 and 0xff,
 * which this program treats as (part of) a GBK character.  The argument
 * is parenthesized so that compound expressions are classified
 * correctly; note that it is evaluated twice. */
#define isGBK(c)	((c) > 0x80 && (c) < 0xff)

/* Append a private copy of `str' to `list'.
 *
 *   list    - token list to extend; it grows in steps of 64 entries
 *   str     - NUL-terminated token text (not modified)
 *   reverse - nonzero to store the bytes of `str' in reverse order
 *   flag    - value stored in the new token's `flag' field
 *
 * MaxLen is raised when the new token is longer than any seen so far.
 * On allocation failure an error is printed and the program exits with
 * status 1.
 */
static void AddToken(tokenlist_t *list, unsigned char *str, int reverse,
		int flag)
{
    size_t len, i;
    unsigned char *copy;

    if (list->size >= list->limit) {
        /* Only commit the new capacity once realloc has succeeded. */
        token_t *grown = realloc(list->tokens,
                                 (list->limit + 64) * sizeof(token_t));
        if (grown == NULL)
            goto mem_err;
        list->tokens = grown;
        list->limit += 64;
    }

    len = strlen((const char *)str);
    copy = malloc(len + 1);
    if (copy == NULL)
        goto mem_err;

    if (reverse) {
        /* Index arithmetic avoids forming a pointer before `str'. */
        for (i = 0; i < len; i++)
            copy[i] = str[len - 1 - i];
    }
    else {
        memcpy(copy, str, len);
    }
    copy[len] = '\0';

    list->tokens[list->size].string = copy;
    list->tokens[list->size].flag = flag;
    list->size++;

    if (MaxLen < len) MaxLen = (unsigned int)len;
    return;

mem_err:
    fprintf(stderr, "Cannot allocate memory block, abort.\n");
    exit(1);
}

/* qsort() comparator for token_t: orders tokens by length first (a
 * shorter string is considered smaller) and, for equal lengths, by
 * strcmp() of their text.  The lengths are compared directly rather than
 * subtracted, so no unsigned difference is narrowed to int. */
static int TokenComp0(const void *t1, const void *t2)
{
    const token_t *a = t1, *b = t2;
    size_t la = strlen((const char *)a->string);
    size_t lb = strlen((const char *)b->string);

    if (la != lb)
        return la < lb ? -1 : 1;
    return strcmp((const char *)a->string, (const char *)b->string);
}

/* Build the per-length index of `list'.
 *
 * The list must already be sorted with TokenComp0 (shortest first).
 * After the call, index[0] is 0 and, for 1 <= n <= MaxLen, index[n] is
 * the position of the first token longer than n bytes (or list->size
 * when there is none), so the tokens of length n are exactly
 * tokens[index[n-1]] .. tokens[index[n] - 1].
 *
 * Prints an error and exits with status 1 if the index cannot be
 * allocated.
 */
void InitIndex(tokenlist_t *list)
{
    size_t n, pos = 0;

    /* Size the array by its element type, not by the pointer type. */
    list->index = malloc(((size_t)MaxLen + 1) * sizeof *list->index);
    if (list->index == NULL) {
        fprintf(stderr, "Memory allocation error.\n");
        exit(1);
    }

    list->index[0] = 0;
    for (n = 1; n <= MaxLen; n++) {
        /* Skip every token of length <= n; `pos' never moves backwards
         * because the list is sorted by length. */
        while (pos < list->size &&
               strlen((const char *)list->tokens[pos].string) <= n)
            pos++;
        list->index[n] = pos;
    }
}

static void init(int argc, char **argv)
{
    FILE *f;
    unsigned char *str=buffer, *p;
    int current = 0;	/* 0: none, 1: left, 2: right, 3: common,
			   4: verbatim, 8: transparent */

#ifdef MIKTEX
    /* FIXME: not tested */
    miktex_initialize ();
    if (!miktex_find_app_input_file("cct", "cctspace.cfg", p = str)) {
#else
#  ifdef USE_KPSE
    kpse_set_program_name(argv[0], "cctspace");
    p = kpse_find_file("cctspace.cfg", kpse_tex_format, 1);
#  else
    /* first check current directory, then fullpathname of .exe file
     * replacing .exe with .cfg */
    if ((f = fopen(p = "cctspace.cfg", "rt")) != NULL) fclose(f); 
    else {
	static unsigned char path[1024];
	strcpy(path, fullpath(argv[0]));
	p = path + strlen(path) - 1;
	while (p > path && *p != '.' && *p != '/' && *p != '\\' && *p != ':')
	    p--;
	((*p == '.') ? strcpy : strcat)(p, ".cfg");
	p = path;
    }
#  endif
    if (p == NULL) {
#endif
	fprintf(stderr, "Cannot find \"cctspace.cfg\", abort.\n");
	exit(1);
    }
    if (verbose) fprintf(stderr, "Reading \"%s\" ...\n", p);
    if ((f = fopen(p, "rt")) == NULL) {
	fprintf(stderr, "Cannot open \"%s\", abort.\n", p);
	exit(1);
    }

    while (1) {
	if (fscanf(f, "%s", str) != 1) break;
	if ((p=strchr(str, '%')) != NULL) {
	    if (p == str || *(p-1) != '\\') {
		while (1) {
		    int c;
		    if ((c = fgetc(f)) == EOF) break;
		    if (c == '\n') break;
		}
		*p = '\0';
		if (p == str) continue;
	    }
	}
	if (!strcmp(str, "COMMON:")) {
	    current = 3;
	    continue;
	}
	else if (!strcmp(str, "LEFT:")) {
	    current = 1;
	    continue;
	}
	else if (!strcmp(str, "RIGHT:")) {
	    current = 2;
	    continue;
	}
	else if (!strcmp(str, "VERBATIM:")) {
	    current = 4;
	    continue;
	}
	else if (!strcmp(str, "TRANSPARENT:")) {
	    current = 8;
	    continue;
	}
	else if (!strcmp(str, "VERBATIM_ENV:")) {
	    current = 16;
	    continue;
	}
	else if (!strcmp(str, "VERB:")) {
	    current = 32;
	    continue;
	}
	if (!current) {
	    fprintf(stderr, "Syntax error in the config file, abort.\n");
	    fclose(f);
	    exit(2);
	}
	if (current & 1) AddToken(&Left,	str, 1, 0);
	if (current & 2) AddToken(&Right,	str, 0, 0);
	if (current & 4) AddToken(&Verbatim,	str, 0, 0);
	if (current & 8) {
	    AddToken(&Left,	str, 1, 1);
	    AddToken(&Right,	str, 0, 1);
	}
	if (current & 16) AddToken(&Verbatim_env, str, 0, 0);
	if (current & 32) AddToken(&Verb, str, 0, 0);
    }
    fclose(f);

    qsort(Left.tokens,  Left.size,  sizeof(token_t), TokenComp0);
    qsort(Right.tokens, Right.size, sizeof(token_t), TokenComp0);
    qsort(Verbatim.tokens, Verbatim.size, sizeof(token_t), TokenComp0);
    qsort(Verbatim_env.tokens, Verbatim_env.size, sizeof(token_t), TokenComp0);
    qsort(Verb.tokens, Verb.size, sizeof(token_t), TokenComp0);

    InitIndex(&Left);
    InitIndex(&Right);
    InitIndex(&Verbatim);
    InitIndex(&Verbatim_env);
    InitIndex(&Verb);

#if 0
    {
	int i;

	fprintf(stderr, "MaxLen = %d\n", MaxLen);

	fprintf(stderr, "Left:\n");
	for (i = 0; i < Left.size; i ++)
	    fprintf(stderr, "\ti=%d, token=%s\n", i, Left.tokens[i].string);

	fprintf(stderr, "Left Index:\n");
	for (i = 0; i <= MaxLen; i++) fprintf(stderr, "\t%d\n", Left.index[i]);
    }
#endif
}

static char *MatchParenthesis(char *str, int step)
/* find matching parenthesis */
{
    int nest = 1;
    char c, left, right;

    left = *str;
    switch (left) {
	case '{' :	right = '}';	break;
	case '}' :	right = '{';	break;
	case '[' :	right = ']';	break;
	case ']' :	right = '[';	break;
	case '(' :	right = ')';	break;
	case ')' :	right = '(';	break;
	case '`' :	right = '\'';	break;
	case '\'' :	right = '`';	break;
	default: 	return NULL;
    }
    
    while ((c = *(str += step)) != '\0') {
	if (c == left) nest++;
	else if (c == right) if (! --nest) return str;
    }
    return NULL;
}

/* bsearch() comparator for token_t: compares only the token text with
 * strcmp(); the `flag' field is ignored. */
static int TokenComp(const void *t1, const void *t2)
{
    const token_t *lhs = t1;
    const token_t *rhs = t2;

    return strcmp((const char *)lhs->string, (const char *)rhs->string);
}

/* Decide whether the text next to a GBK/ASCII boundary matches a token
 * that forbids inserting a separator there.
 *
 *   str - for dir == 0: the last byte of the text on the left side;
 *         for dir == 1: the first byte of the text on the right side
 *   dir - 0 searches the `Left' list (text is compared right-to-left
 *         against the reversed tokens), 1 searches the `Right' list
 *
 * Returns 1 when a normal token matches, or when a transparent token
 * matches and the text inside its argument already supplies the
 * separation; returns 0 otherwise.
 *
 * Uses the globals `start', `end' and `buffer', and relies on the '\0'
 * sentinels around the data buffer to stop backward and forward scans.
 * Exits with status 1 if the scratch string cannot be allocated.
 */
static int BSearch(unsigned char *str, int dir)
{
    static token_t t = {NULL, 0};   /* scratch key, allocated once */
    int len;
    size_t toklen;
    tokenlist_t *list;
    unsigned char *p, *q;

    if (t.string == NULL) {
        if ((t.string = malloc(MaxLen + 1)) == NULL) {
            fprintf(stderr, "Cannot allocate memory, abort.\n");
            exit(1);
        }
    }

    /* toklen is the minimum number of bytes to compare - 1 */
    toklen = 0;

    /* Copy at most MaxLen bytes into the scratch key.  When comparing
     * to the left, the bytes are copied in reverse order. */
    if (dir == 0) {
        unsigned char *tmpstr = str;
        /* check for TeX commands and drop the arguments ('{..}' and
         * '[..]') so that the command name itself is compared */
        if (*tmpstr == '}' || *tmpstr == ']') {
            p = tmpstr;
            while (1) {
                p = (unsigned char *)MatchParenthesis((char *)p, -1);
                if (p == NULL) break;
                while (isspace(*(--p)));
                if (*p != '}' && *p != ']') break;
            }
            if (p != NULL && (isalpha(*p) || *p == '*')) {
                q = p;
                while (isalpha(*(--p)));
                if (*p == '\\') tmpstr = q;
            }
        }
        list = &Left;
        len = MaxLen;
        if (len > tmpstr - buffer + 1) len = (int)(tmpstr - buffer + 1);
        p = tmpstr;
        q = t.string;
        while (p > tmpstr - len) *(q++) = *(p--);
        *q = '\0';
        /* compare at least 2 bytes if GBK char */
        if (toklen < 1 && isGBK(*start)) toklen = 1;
    }
    else {
        list = &Right;
        len = MaxLen;
        if (len > end - str) len = (int)(end - str);
        memcpy(t.string, str, len);
        t.string[len] = '\0';
        /* compare at least 2 bytes if GBK char */
        if (!isGBK(*start)) toklen = 1;
        /* set minimum size of string to compare to avoid matching
         * a shorter token */
        if (*t.string == '\\' && len > 1) {
            while (++toklen < (size_t)len && isalpha(t.string[toklen]));
            if (toklen > 1) toklen--;
        }
    }

    /* Try every candidate length, shortest first: truncate the key to
     * toklen + 1 bytes and search the tokens of exactly that length. */
    for (; toklen < (size_t)len; toklen++) {
        unsigned char c;
        int n = list->index[toklen + 1] - list->index[toklen];
        token_t *res;
        if (n <= 0) continue;
        c = t.string[toklen + 1];
        t.string[toklen + 1] = '\0';
        res = bsearch(&t, list->tokens + list->index[toklen], n,
                        sizeof(token_t), TokenComp);
        if (res != NULL) {
            if (res->flag == 0) return 1;
            /* Transparent command */
            if (dir == 0) {
                /* the case: "...}[...] GBK char ..."
                 *                    ^ start	*/
                p = start;
                while (p != NULL && p >= buffer && *p != '}') {
                    if (*p == ']')
                        p = (unsigned char *)MatchParenthesis((char *)p, -1);
                    else
                        p--;
                }
                /* No closing brace found: there is no argument to
                 * inspect, and p - 1 must not be dereferenced. */
                if (p == NULL || p < buffer) return 0;
                if ((p - buffer >= 2 && isGBK(*(p - 2))) ||
                    isspace(*(p - 1)) || *(p - 1) == '~') return 1;
            }
            else {
                /* the case: "GBK char \xxxxx[...]{..."
                 *            ^ start	*/
                p = start + 2;
                while (p != NULL && p < end && *p != '{') {
                    if (*p == '[')
                        p = (unsigned char *)MatchParenthesis((char *)p, 1);
                    else
                        p++;
                }
                /* No opening brace before `end': nothing to inspect,
                 * and bytes past `end' must not be read. */
                if (p == NULL || p >= end) return 0;
                p++;
                if ((end - p >= 1 && isGBK(*p)) || isspace(*p) || *p == '~')
                    return 1;
            }
            return 0;
        }
        t.string[toklen + 1] = c;
    }

    return 0;
}

/* Resize `ptr' to `size' bytes; print an error and exit with status 1
 * when the allocation fails. */
static void *ReallocOrDie(void *ptr, size_t size)
{
    void *grown = realloc(ptr, size);

    if (grown == NULL) {
        fprintf(stderr, "Cannot allocate memory, abort.\n");
        exit(1);
    }
    return grown;
}

/* Program entry point.
 *
 * Usage: cctspace [options] [inputfile [outputfile]]
 *   -t  force '~' instead of a space
 *   -v  verbose
 *   -h  help
 * A missing file name or "-" selects stdin / stdout.
 *
 * The input is streamed through the sliding data buffer and copied to
 * the output, inserting '~' or ' ' between adjacent GBK and ASCII
 * characters unless the configuration tokens, comments or verbatim
 * constructs forbid it.
 *
 * Exit status: 0 on success, 1 for usage errors, 2 when the input file
 * cannot be opened, 3 when the output file cannot be opened.
 */
int main(int argc, char *argv[])
{
    unsigned char *p, *p0;
    int newlines, i;
    char *infile = NULL, *outfile = NULL;
    /* vars to record end verbatim strings */
    char *endv = NULL;
    int  endv_len = 0, endv_size = 0;

    /* Left-hand sentinel for backward scans. */
    RealBuffer[0] = '\0';

    for (i = 1; i < argc; i++) {
        if (argv[i][0] == '-' && argv[i][1] != '\0') {
            if (argv[i][2] != '\0') goto usage;
            switch (toupper((unsigned char)argv[i][1])) {
                case 'T':
                    force_tilde = 1;
                    break;
                case 'V':
                    verbose = 1;
                    break;
                case 'H':
                    /* falls through */
                default:
                    goto usage;
            }
        }
        else if (infile == NULL) infile = argv[i];
        else if (outfile == NULL) outfile = argv[i];
        else {
    usage:
            fprintf(stderr, "Usage: %s [options] [inputfile [outputfile]]\n",
                            argv[0]);
            fprintf(stderr, "Options:\n");
            fprintf(stderr, "\t-t\tforces inserting '~' instead of space.\n");
            fprintf(stderr, "\t-v\tdisplays more messages.\n");
            fprintf(stderr, "\t-h\tdisplays this help message.\n");
            fprintf(stderr, "'-' used in place of a filename "
                            "stands for stdin or stdout.\n");
            exit(1);
        }
    }

    if (verbose) {
        fprintf(stderr, "cctspace, v" VERSION ", 2003.\n");
    }

    fin = (infile == NULL || !strcmp(infile, "-")) ?
            stdin : fopen(infile, "rt");
    if (fin == NULL) {
        fprintf(stderr, "Error: cannot open input file \"%s\".\n", infile);
        exit(2);
    }

    fout = (outfile == NULL || !strcmp(outfile, "-")) ?
            stdout : fopen(outfile, "w+t");
    if (fout == NULL) {
        fprintf(stderr, "Error: cannot open output file \"%s\".\n", outfile);
        exit(3);
    }

    init(argc, argv);

#if 0
    /* Test BSearch() function */
    while (1) {
	unsigned char key[1024];

	printf("Enter key word: ");
	scanf("%s", key);

	strcpy(buffer, key);
	start = buffer;
	end = buffer + strlen(key);
	printf("Left:  %s\n", BSearch(end-1,0) ? "found" : "not found");
	printf("Right: %s\n", BSearch(start,1) ? "found" : "not found");
    }
#endif

    start = end = buffer;
    while (start < end || fin != NULL) {
        /* Refill when fewer than TRUNC unprocessed bytes remain. */
        if (end - start < TRUNC) {
            if (start - buffer > 2*TRUNC) {
                /* Drop the oldest TRUNC bytes.  Source and destination
                 * overlap, so memmove (not memcpy) is required. */
                start -= TRUNC;
                end -= TRUNC;
                if (start < end)
                    memmove(buffer, buffer + TRUNC, end - buffer);
            }
            if (fin != NULL) {
                size_t len, len0 = BUFFERLEN - (end - buffer);
                len = fread(end, 1, len0, fin);
                if (len < len0) {
                    /* short read: end of input (or read error) */
                    if (fin != stdin) fclose(fin);
                    fin = NULL;
                }
                end += len;
            }
            if (start >= end) break;
            /* Right-hand sentinel for forward scans. */
            *end = '\0';
        }

        if (endv_len) {
            /* We are in verbatim mode, check for end verbatim sequence */
            if (end - start >= endv_len &&
                    !strncmp((char *)start, endv, endv_len)) {
                /* end of verbatim mode (we don't output the last char
                 * in the endverbatim string in order to check with the
                 * next char). */
                while (--endv_len) PUTCHAR(*(start++));
            }
            else {
                /* output verbatim char */
                if (isGBK(*start)) PUTCHAR(*(start++));
                PUTCHAR(*(start++));
            }
            continue;
        }

        /* check for user-defined char ('#[ddd]') */
        if (*start == '#' && *(start + 1) == '[' &&
                (start <= buffer || *(start - 1) != '\\')) {
            char *tail;
            /* Keep the value as long so that out-of-range numbers are
             * rejected instead of being truncated to int. */
            long code = strtol((char *)start + 2, &tail, 10);
            if (*tail == ']' && code >= 0 && code < 3760) {
                /* user-defined char */
                fprintf(fout, "{\\ziti{@}%c%c}",
                        (int)(code/94)+176, (int)(code%94)+161);
                start = (unsigned char *)tail + 1;
                continue;
            }
        }

        if (*start == '%' && (start <= buffer || *(start - 1) != '\\')) {
            /* Skip comments (assuming linelength < TRUNC!) */
            while (start < end) {
                PUTCHAR(*start);
                if (*(start++) == '\n') break;
            }
            continue;
        }

        if (*start == '\\') {
            if (*(start+1) > 128) {
                /* Chinese char following '\\' */
                PUTCHAR(*(start++));
                continue;
            }

            /* check for verbatim type environments */
            if (end - start > 9 && !memcmp(start, "\\begin{", 7)) {
                p = (unsigned char *)MatchParenthesis((char *)start + 6, 1);
                if (p != NULL) {
                    void *res = NULL;
                    unsigned char save = *p;
                    int len = (int)(p - start - 7);
                    token_t t = {start + 7, 0};
                    /* Temporarily terminate the environment name. */
                    *p = '\0';
                    /* len == 0 ("\begin{}") would index index[-1]. */
                    if (len >= 1 && (unsigned int)len <= MaxLen)
                        res = bsearch(&t,
                                Verbatim_env.tokens + Verbatim_env.index[len-1],
                                Verbatim_env.index[len] -
                                        Verbatim_env.index[len-1],
                                sizeof(token_t), TokenComp);
                    *p = save;
                    if (res != NULL) {
                        /* Remember "\end{name}" as the terminator. */
                        endv_len = len + 6;
                        if (endv_size < endv_len + 1)
                            endv = ReallocOrDie(endv,
                                                endv_size = endv_len + 1);
                        memcpy(endv, "\\end{", 5);
                        memcpy(endv + 5, start + 7, len);
                        strcpy(endv + 5 + len, "}");
                        while (start < p) PUTCHAR(*(start++));
                        continue;
                    }
                }
            }

            /* Find the end of the command name (letters, optional '*'). */
            p = start;
            while (p<end && isalpha(*(++p)));
            if (p < end && *p == '*') p++;

            /* check for \verb type commands */
            if (p < end) {
                int len = (int)(p - start);
                void *res = NULL;
                unsigned char save = *p;
                token_t t = {start, 0};
                *p = '\0';
                if ((unsigned int)len <= MaxLen)
                    res = bsearch(&t, Verb.tokens + Verb.index[len - 1],
                                Verb.index[len] - Verb.index[len - 1],
                                sizeof(token_t), TokenComp);
                *p = save;
                if (res != NULL) {
                    /* The char after the command is the delimiter that
                     * also ends the verbatim text. */
                    if (endv_size < 2)
                        endv = ReallocOrDie(endv, endv_size = 2);
                    endv_len = 1;
                    *endv = (char)save;
                    endv[1] = '\0';
                    while (start <= p) PUTCHAR(*(start++));
                    continue;
                }
            }

            /* Process commands in the `Verbatim' list */
            p0 = p;
            if (p < end && isspace(*p)) while (p < end && isspace(*(++p)));
            if (p < end && *p != '{' && *p != '[') {
                /* check for special case: '\input file' */
                if (p0 - start == 6 &&
                        !strncmp((char *)start, "\\input", 6)) {
                    while (start < p) PUTCHAR(*(start++));
                    while (p<end && !isspace(*p) && *p!='%') PUTCHAR(*(p++));
                    start = p;
                    continue;
                }
            }
            else if (p < end) {
                unsigned char c = *p0;
                void *res = NULL;
                token_t t = {start, 0};
                int len = (int)(p0 - start);

                *p0 = '\0';
                if ((unsigned int)len <= MaxLen)
                    res = bsearch(&t, Verbatim.tokens + Verbatim.index[len - 1],
                                Verbatim.index[len] - Verbatim.index[len - 1],
                                sizeof(token_t), TokenComp);
                *p0 = c;
                if (res != NULL) {
                    p = (unsigned char *)MatchParenthesis((char *)p, 1);
                    if (p != NULL && *p == ']') {
                        /* skip optional argument */
                        while (++p < end && isspace(*p));
                        if (*p != '{')
                            p = NULL;
                        else
                            p = (unsigned char *)MatchParenthesis((char *)p, 1);
                    }
                    if (p != NULL) {
                        /* Copy the command and its argument unchanged. */
                        while (start <= p) PUTCHAR(*(start++));
                        continue;
                    }
                }
            }
        }

        if (isspace(*start)) {
            while (start < end && isspace(*start)) PUTCHAR(*(start++));
            continue;
        }
        if (end - start < 3) {
            PUTCHAR(*(start++));
            continue;
        }
        /* p: first non-space byte after the character at `start'. */
        p = start + (isGBK(*start) ? 2 : 1);
        newlines = 0;
        while (p < end && isspace(*p)) {
            if (*p == '\n') newlines++;
            p++;
        }
        if (p == end || newlines > 1 || isGBK(*start) == isGBK(*p)) goto cont;
        /* The case below can only happen if the two characters are
         * separated by >TRUNC spaces */
        if (isGBK(*p) && p == end - 1) goto cont;
        /* -----------------------------------------------------------
         * We now have the following sequence in the buffer:
         *
         *	  ... <char> ... [spaces] ... <char> ...
         *		^			^
         *		start			p
         *
         * And one of the two chars is a GBK char, the other one is
         * an ASCII char.
         * ----------------------------------------------------------- */
#if 0
		{
			unsigned char *q;
			fprintf(stderr, "==================================\n");
			for (q=start; q<p; q++) fprintf(stderr, "%c", *q);
			fprintf(stderr, "%c%c\n", *p, isGBK(*p) ? *(p+1) : ' ');
			fprintf(stderr, "start=%d, p=%d, end=%d\n",
				start-buffer, p-buffer, end-buffer);
		}
#endif
        /* Note: don't change value of `start' before calling BSearch!!! */
        if (isGBK(*start)) {
            /* p0: second byte of the GBK char, i.e. the last byte of
             * the text on the left side. */
            p0 = start + 1;
            if (*p == '~') goto cont;
            {
                /* don't insert '~' before a user-defined char */
                unsigned char *q = p;
                while (isspace(*q) && q < end) q++;
                if (q < end && *q == '#' && *(q + 1) =='[') goto cont;
            }
            /* check for "\ " */
            if (*p == '\\' && p < end-1 && isspace(*(p+1))) goto cont;
            if (BSearch(p0, 0) || BSearch(p, 1)) goto cont;
            while (start < p) PUTCHAR(*(start++));
            PUTCHAR('~');
        }
        else {
            /* check if already spaces or '~' */
            if (!force_tilde) {
                if (p > start + 1 || *start == '~') goto cont;
                if (BSearch(start, 0) || BSearch(p, 1)) goto cont;
                PUTCHAR(*(start++));
                PUTCHAR(' ');
            }
            else {
                if (*start == '~') goto cont;
                if (BSearch(start, 0) || BSearch(p, 1)) goto cont;
                PUTCHAR(*start);
                PUTCHAR('~');
                if (newlines) {
                    /* there must be exactly one newline between */
                    PUTCHAR('%');
                    PUTCHAR('\n');
                }
                /* remove spaces between */
                start = p;
            }
        }
    cont:
        /* Copy whatever lies between `start' and `p' unchanged. */
        while (start < p) PUTCHAR(*(start++));
    }

    if (endv_len)
        fprintf(stderr, "Warning: file ended within verbatim mode.\n");

    if (fin != stdin && fin != NULL) fclose(fin);
    if (fout != NULL) {
        if (fout != stdout) fclose(fout); else fflush(fout);
    }

    exit(0);
}
