(file) Return to gbk2uni-hooklee.c CVS log (file) (dir) Up to [cvs] / cct / cct / Attic / gbk2uni-hooklee.c

  1 zlb   1.1 // gbk2uni.cpp : Transform GBK characters in .out file to unicode codes.
  2           //     the initial code is from out2uni in dvipdfmx project of KTUG
  3           // authors:  cxterm and Linbo Zhang in 2003
  4           // reach them at http://www.ctex.org
  5           // enhancer: hooklee (Shujun Li) in 2003
  6           // reach hooklee at http://www.hooklee.com or www.chinatex.org
  7           
  8           /////////////////////////////////////////////////////////////////////////
  9           //********************hyperref书签文件编码规则**************************
 10           //每个书签以如下形式存放 :\BOOKMARK [1][-]{section.0.1}{书签正文}{}
 11           //非unicode模式下使用hyperref宏包,bookmark中的部分特殊字符以\ooo的形式插入
 12           //' ':\040, '#':\043, '$':\044, '%':\045, '&':\046, '\':\134, '^':\136, '_':\137, '{':\173, '}':\175, '~':\176
 13           //比较特殊的是'('和')',是以'\('和'\)'的形式插入的,而不是\ooo形式
 14           //\S:\247
 15           //所有其他字符和汉字均不作任何处理,在bookmark中保留
 16           //已经知道,这种保留会造成部分汉字在bookmark中无法显示
 17           //当使用\CJKchar{"0081}{"040}方式直接以GBK代码的方法插入汉字,bookmark中会生成如下的书签代码:
 18           //"0081"040,显然,pdflatex忽略了\CJKchar命令本身和前后的{}把参数当做普通文本做了转换
 19           //'^^xx^^yy'形式的CJK汉字在.out中有两种可能的出现方式:'^^xx^^yy'和'^^xxL'
 20           /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 21           //unicode模式下使用hyperref宏包,bookmark特殊字符均编码为\ooo\ooo或者\000x或者\000x\80y形式的unicode代码
 22 zlb   1.1 //书签内容均以\376\377开头作为前导标示符
 23           //经过实验,相应的bookmarkunicode代码插入规则如下:
 24           //*****A类:编码为\ooo\ooo的特殊字符部分*****
 25           //' '(空格):\000\040,使用\textvisiblespace也得到同样的书签
 26           //'#'(\#):\000\043, '$'(\$):\000\044, '%'(\%):\000\045, '&'(\&):\000\046
 27           //'(':\000\050; ')':\000\051
 28           //'\'(\textbackslash):\000\134;
 29           //'^'(\textasciicircum):\000\136; '_'(\_):\000\137
 30           //'{'(\{):\000\173; '}'(\}):\000\175
 31           //'~'(\textasciitilde):\000\176;
 32           //*****B类:编码为\000x的普通字符部分,其中x表示字符本身*****
 33           //abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789
 34           //|:',./!?;"-+=[]`*@(直接用@即可,无需\@)<(或\textless)>(或\textgreater)
 35           //*****C类:单个汉字*****
 36           //假设其高位码为H,低位码为L,则一般的GB汉字其插入形式为:\000H\80L
 37           //但是上述情况存在例外,当L为普通拉丁字符时,将会以\000HL的形式插入
 38           //如果任何汉字出现在一个低位为拉丁字符的GBK汉字之后,第二个汉字会以\80H\000L的形式出现
 39           //只有\80HL是不可能出现的汉字代码
 40           //当书签中包含多个汉字的时候,重复按照上述规则插入,汉字中间的其他字符按照正常规则插入
 41           /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 42           //@注意:当汉字低位字节为字符'}{~\_^'时,tex文档编译会出现错误,强行编译可能出现不可预测的行为
 43 zlb   1.1 //@插入out文件的内容变得很混乱,一般书签正文会在低位'}'出现之后终止,gbk2uni只尽可能地消除这种影响
 44           //@这可能使得部分GBK汉字在书签中消失或者显示为其他字符
 45           //@使用张林波老师随CCT新版发行的cctconv程序可以解决这个问题
 46           //@cctconv把汉字低位字节为'\', '{', '}', '^', '_', '~'的汉字低位字节分别改为'012345'以方便处理
 47           //@或者使用-f开关转换可以将所有高位为1的字符转换为^^xx的形式,这在一些老的不支持扩展字符的tex系统中有用
 48           //@gbk2uni处理这样的汉字假设cctconv已经运行(cctconv与CJK兼容,无需cct.sty即可得到正确的dvi文件)
 49           //@这样的汉字经cctconv处理后,插入out文件的对应内容在unicode模式下有两种可能:
 50           //@sprintf("\\000%d\\%03o", H, L)和sprintf("%d\\%03o", H, L),这里L已经是被转换回来的'}{~\_^'
 51           //现在还不清楚是否也有sprintf("\\80%d\\%03o", H, L)形式出现(根据'\80HL'不出现推测这种形式可能也不出现)
 52           /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 53           //*****D类:\CJKchar{"00ab}{"0cd}形式的CJK汉字*****
 54           //\000"\0000\0000\000a\000b\000"\0000\000c\000d
 55           //显然,unicode模式的hyperref是如下工作的:
 56           //第一步先生成非unicode模式的out文件,接着对其中的扩展字符做了一个后处理,但是这个处理对汉字不正确
 57           //*****E类:'^^xx^^yy'形式的CJK汉字*****
 58           //在unicode模式下,.out文件的内容一般为'\000^^xx\80^^yy'或者'\000^^xx\80L'
 59           //估计其他CJK汉字的样式也可能出现:'\80^^xx\000^^yy'、'\80^^xx\000L'、'\000^^xx^^yy'、'\000^^xx\ooo'
 60           //以上情况说明,在处理过程中,我们可以像TeX那样将每一个'^^xx'码字当成普通的ASCII字符来处理即可兼容'^^xx'代码
 61           /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 62           //如果\CJKchar{}{}中的前后两个参数位数不同,单从.out文件无法判断第二个参数何时结束
 63           //因此,gbk2uni要求在tex文档中统一采用三位十六进制法\CJKchar{"0xx}{"0xx}表示前后两个参数
 64 zlb   1.1 //考虑到在实际中\CJKchar用的比较少,这个约定应该不会算大的限制。
 65           /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 66           
 67           #define VERSION "0.22"
 68           
 69           #include <stdio.h>
 70           #include <stdlib.h>
 71           #include <string.h>
 72           #include <ctype.h>
 73 zlb   1.1.1.2 #if defined(WIN32) && !defined(__MINGW32__)
 74 zlb   1.1     #  include <io.h>
 75               #  define PATH_MAX	_MAX_PATH+1
 76               #else
 77               #  include <unistd.h>
 78               #  include <limits.h>
 79               #endif
 80               
 81               #include "gbk2uni.h"
 82               
 83               #ifndef WIN32
 84               #  define _fileno(f) f
 85               static size_t _filelength(FILE *f)
 86               {
 87                   size_t pos = ftell(f);
 88                   size_t length;
 89               
 90                   fseek(f, 0, SEEK_END);
 91                   length = ftell(f);
 92                   fseek(f, pos, SEEK_SET);
 93                   return length;
 94               }
 95 zlb   1.1     #endif
 96               
 97               #if !defined(WIN32) && !defined(GO32)
 98               static int strnicmp (const char *s0, const char *s1, int n)
 99               {
100                   int i;
101                   while (n-- > 0 && *s0 != '\0' && *s1 != '\0') {
102               	i = toupper(*(s0++)) - toupper(*(s1++));
103               	if (i) return i;
104                   }
105                   return n > 0 ? toupper(*s0) - toupper(*s1) : 0;
106               }
107               #endif
108               
#define BYTE unsigned char
#define DWORD unsigned int

/*
 * Range checks for the two bytes of a GBK character.  Only the low eight
 * bits of the argument are examined.  The argument is evaluated twice, so
 * do not pass expressions with side effects.
 */

//is a valid high byte of some GBK character (0x81..0xfe)
#define GBK_HIGH(h)  ((0x81 <= ((h) & 0xff)) && (((h) & 0xff) <= 0xfe))

//is a valid low byte of some GBK character (0x40..0xfe)
//NOTE(review): 0x7f passes this test although GBK is not expected to use it
//as a trail byte -- confirm how the conversion table treats that slot
#define GBK_LOW(l)   ((0x40 <= ((l) & 0xff)) && (((l) & 0xff) <= 0xfe))
117               
/* Streams shared by the parser and main(): the .out file being read and
 * the temporary file the converted bookmarks are written to. */
FILE *Fin;
FILE *Fout;

/* Command-line switches (set in main). */
int bSilent = 0;          /* '-s': suppress the version banner and final message */
int bParsingErrors = 1;   /* cleared by '-npe': do not print parsing diagnostics */
int bCJKchar = 1;         /* \CJKchar support, on by default; '-nocjk' turns it off */
int bUnlockOnly = 0;      /* '-unlock': only remove the lock line, do not parse */
int bLock = 0;            /* '-u' / '-l': write the lock line into the .out file */
127               
128               void version(void)
129               {
130                 printf("gbk2uni, version "VERSION", initially implemented by cxterm and ZLB in Jan. 2003\n");
131                 printf("\t enhanced by hooklee in Mar. 2003.\n");
132                 printf("\t please visit www.ctex.org and www.chinatex.org for more information.\n");
133               }
134               
/* Print the banner followed by the command-line help to standard output. */
void usage(void)
{
  static const char *const help[] = {
    "Usage : gbk2uni [options] filename[.out] [options]\n",
    "Options:\n",
    "\t-u(-l)\t lock .out file to avoid overwritten in the next (pdf)latex run\n",
    "\t\t (.out file will be unlocked if no '-u' and '-l' options)\n",
    "\t-unlock\t unlock .out file without parsing .out file\n",
    "\t-s\t run gbk2uni silently (but errors remain)\n",
    "\t-cjk\t parse \\CJKchar{\"0xx}{\"0xx} command (default)\n",
    "\t-nocjk\t disable parsing \\CJKchar{\"0xx}{\"0xx} command\n",
    "\t-npe\t disable display of all parsing errors\n"
  };
  size_t k;

  version();
  for (k = 0; k < sizeof help / sizeof help[0]; k++)
    fputs(help[k], stdout);
}
150               
151               /*
152               //write unicode into the file Fout
153               void putucode(unsigned int u)
154               {
155                 unsigned int h, l;
156               
157                 l = u & 0xff;
158 zlb   1.1       h = (u >> 8) & 0xff;
159               
160                 fprintf(Fout,"%c%03o%c%03o",'\\',h,'\\',l);
161               }*/
162               
163               //put a GBK code
164               void putGBKcode(BYTE h, BYTE l,int nLine)
165               {
166               	unsigned int u;
167               	unsigned int hu, lu;
168               
169               	if (!GBK_HIGH(h) || !GBK_LOW(l)) {
170               		//if current GBK character is not valid, it will be discarded
171               		if (bParsingErrors)
172               			fprintf (stderr, "An invalid GBK character is found:\n\tLine %d: ... 0x%x%x\n", nLine, h,l);
173               		return;
174               	}
175               
176               	u = gbk2uni[(h-0x81)*192 + (l-0x40)];
177               	lu = u & 0xff;
178               	hu = (u >> 8) & 0xff;
179 zlb   1.1     	fprintf(Fout,"\\%03o\\%03o",hu,lu);
180               }
181               
//is c an octal digit, i.e. one of '0'...'7'?  returns 1 if so, 0 otherwise
int is8digit(char c)
{
	if (c < '0') return 0;
	if (c > '7') return 0;
	return 1;
}
187               
//is c a hexadecimal digit: '0'...'9','a'...'f','A'...'F'?
//returns 1 if so, 0 otherwise
//c is converted to unsigned char first: the parser feeds raw bytes here,
//and passing a negative char to a <ctype.h> function is undefined
int is16digit(char c)
{
	return isxdigit((unsigned char)c) ? 1 : 0;
}
193               
//convert the three octal digit characters at str[0..2] to their value;
//the result is truncated to unsigned char (e.g. "777" gives 255)
unsigned char otoi(char *str)
{
	int value = str[0] - '0';

	value = value * 8 + (str[1] - '0');
	value = value * 8 + (str[2] - '0');
	return value;
}
199               
//convert the two hexadecimal digit characters at str[0..1] (either case)
//to their value
//the caller is expected to have validated the digits (e.g. with is16digit);
//other characters produce an unspecified byte, not an error
unsigned char xtoi(char *str)
{
	unsigned char value = 0;
	int k;

	for (k = 0; k < 2; k++) {
		//go through unsigned char: tolower() on a negative char is undefined
		unsigned char c = (unsigned char)tolower((unsigned char)str[k]);
		unsigned char digit;

		if (isdigit(c)) digit = c - '0';
		else digit = c - 'a' + 10;
		value = (unsigned char)(16*value + digit);
	}
	return value;
}
212               
213               //parse '\000"\000x\000x' generated by \CJKchar{}{} command
214               //this function is used to skip the leading string '\000'
215               int getCJKchar(char **str,int nLine)
216               {
217               	int i;
218               
219               	while(**str!='\\' && **str != '\0' && **str != '}') (*str)++;//find the next '\\'
220               	if (**str == '}' || **str == '\0') return 0;
221               	(*str)++;
222               	for (i=0; i < 3; i++) {
223               		if(**str != '0') break;
224               		(*str)++;
225               	}
226               	if (i != 3) {
227               		if (bParsingErrors)
228               			fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, (*str)-i-1);
229               		return -1;
230 zlb   1.1     	}
231               	return 1;
232               }
233               
234               //translate a '^^xx'-format TeX character to an ascii character
235               //if not a '^^xx'-format TeX character, return itself
236               int translateChar(char **str,int nLine)
237               {
238               	BYTE a;
239               
240               	if ( **str != '^') {
241               		a = **str;
242               		if ( **str != '}' && **str != '\0' ) (*str)++;
243               		return a;//if not '^^xx' directly return the current character
244               	}
245               
246               	while(**str == '^') (*str)++;//skip all '^' characters
247               	if ( is16digit(**str) && is16digit(*(*str+1)) ) {
248               		a = xtoi(*str);
249               		(*str) = *str + 2;
250               		return a;
251 zlb   1.1     	}
252               	else {
253               		if (bParsingErrors)
254               			fprintf (stderr, "An incomplete '^^xx' TeX character is found:\n\tLine %d: ... \"%s\"\n", nLine, (*str)-2);
255               		return -1;//-1L = 0xffffffff
256               	}
257               }
258               
259               //parse the bookmark and generate corresponding unicode codes
260               char *doparse(char *str,BYTE bUnicode,int nLine)
261               {
262               	BYTE	lh,h,l;//h denotes high byte and l denotes low byte of a unicode character,lh denotes the leading '\ooo'
263               	int		i, rtn;
264               	char	strCode[4];
265               
266               	//skip to the next valid character... needed or not?
267               	while(1) {
268               		if(bUnicode)
269               			while(*str==' ' || *str=='\t' || *str=='\n' || *str=='\r') str++;
270               		else
271               			while(*str=='\t' || *str=='\n' || *str=='\r') str++;
272 zlb   1.1     		
273               		if ( *str == '}' || *str == '\0') return str;//end
274               
275               		switch(*str) {
276               		case '\\':
277               			//original unicode codes generated by pdflatex, including '\(' and '\)'
278               			//note: '\oo' and '\par' may occur in wrongly-complied tex document
279               			while(*str == '\\') str++;//occasionally double '\' may occur in a wrong .out file
280               			//processing '\ooo' in non-unicode mode
281               			if(!bUnicode) {
282               				//processing '\(' and '\)' in non-unicode
283               				if ( *str == '(' || *str == ')') {
284               					fprintf(Fout,"\\000\\%03o", *str++);
285               					break;
286               				}
287               				//in non-unicode mode, '\ooo' is possible for special latin character, such as '\S'
288 zlb   1.1.1.2 				if (isdigit(*str)) {
289 zlb   1.1     					fprintf(Fout,"\\000\\");//add '\000' prefix to current special unicode character
290               					i=0;
291               					while(1) {
292               						fputc(*str++,Fout);i++;
293               						if(!isdigit(*str) || i >=3) break;
294               					}//to avoid less than three digital characters after '\'
295               					//fwrite(Fount,1,3,str);str+=3;
296               					if ( i < 3 && bParsingErrors)
297               						fprintf (stderr, "An incomplete special character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
298               				}
299               				break;
300               			}
301               			//processing '\ooo\ooo' or '\000x' or '\000H\80L' or '\000HL' or '\80H\000L' in unicode mode
302               			//here please note that either 'H' or 'L' or both two can be '^^xx'-format
303 zlb   1.1.1.2 			if (isdigit(*str)) {
304 zlb   1.1     				//in unicode mode, '\ooo\ooo' and '\000x' and '\000H\80L' are all possible for different characters
305               				//possibly, '\oo' should be taken into consideration to avoid possible collapse of gbk2uni
306               				strCode[0]=*str++;
307               				for (i=1; i<3; i++) {
308               					if(isdigit(*str)) strCode[i]=*str++;
309               					else break;
310               				}
311               				strCode[i]='\0';
312               				if (i == 1) {
313               					if (bParsingErrors)
314               						fprintf (stderr, "An incomplete special unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
315               					break;
316               				}
317               				lh = atoi(strCode);//get the high byte of current unicode character
318               				if(i ==3 && *str == '\\') {//'\ooo\ooo': normal unicode character
319               					fprintf(Fout, "\\%s\\", strCode);//directly output leading '\ooo\'
320               					str++;
321               					for (i=0; i<3; i++) {
322               						if(isdigit(*str)) strCode[i]=*str++;//directly output the left 'ooo'
323               						else break;
324               					}
325 zlb   1.1     					strCode[i]='\0';
326               					if (i < 3) {
327               						if (bParsingErrors)
328               							fprintf (stderr, "An incomplete unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
329               					}
330               					else fprintf(Fout, "%s", strCode);//directly output the left 'ooo'
331               					break;
332               				}
333               				if (lh == 0 && *str == '\"' && bCJKchar) {//\CJKchar{"0xx}{"0xx} command in unicode mode
334               					rtn = getCJKchar (&str, nLine);
335               					if(rtn == 0) return str;
336               					if(rtn == -1) break;//skip the first '\0000'
337               					rtn = getCJKchar (&str, nLine);
338               					if(rtn == 0) return str;
339               					if(rtn == -1) break;
340               					strCode[0] = *str++;//get the first digit of high byte
341               					rtn = getCJKchar (&str, nLine);
342               					if(rtn == 0) return str;
343               					if(rtn == -1) break;
344               					strCode[1] = *str++;//get the second digit of high byte
345               					h = xtoi (strCode);//get high byte
346 zlb   1.1     
347               					rtn = getCJKchar (&str, nLine);
348               					if(rtn == 0) return str;
349               					if(rtn == -1) break;
350               					if (*str != '\"') {//is the third unicode code '"'?
351               						if (bParsingErrors)
352               							fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, str-4);
353               						break;
354               					}
355               					rtn = getCJKchar (&str, nLine);
356               					if(rtn == 0) return str;
357               					if(rtn == -1) break;//skip the second '\0000'
358               					rtn = getCJKchar (&str, nLine);
359               					if(rtn == 0) return str;
360               					if(rtn == -1) break;
361               					strCode[0] = *str++;//get the first digit of low byte
362               					rtn = getCJKchar (&str, nLine);
363               					if(rtn == 0) return str;
364               					if(rtn == -1) break;
365               					strCode[1] = *str++;//get the second digit of low byte
366               					l = xtoi (strCode);//get low byte
367 zlb   1.1     					putGBKcode(h, l, nLine);//put unicode code via GBK2UNICODE transformation
368               					break;
369               				}
370               				//'\000x' or '\000H\80L' or '\80H\000L' or '\000H\ooo'
371               				rtn = translateChar(&str,nLine);
372               				if (rtn == -1) break;//break when encountering errors
373               				else h = (BYTE) rtn;
374               				if (lh == 0 && h != 0 && h != '}' && h < 0x80) {//'\000x' format remains
375               					//translate '\000x' to '\000\ooo' to get more robust result
376               					fprintf(Fout, "\\000\\%03o", h);
377               					break;
378               				}
379               				if (lh == 0 && h > 0x80) {//'\000HL' or '\000H\80L' or '\000H\ooo'
380               					//h = (BYTE) *str++;//set high GBK byte
381               					if ( *str != '\\') {//'\000HL'
382               						rtn = translateChar(&str,nLine);
383               						if (rtn == -1) break;//break when encountering errors
384               						else l = (BYTE) rtn;
385               						//l = *str++;
386               						putGBKcode(h, l, nLine);//put unicode code via GBK2UNICODE transformation
387               					}
388 zlb   1.1     					else {//'\000H\80L' or '\000H\ooo'
389               						if (*(str+1) == '8' && *(str+2) == '0') {// is '80L' after '\'?
390               							str += 3;
391               							rtn = translateChar(&str,nLine);
392               							if (rtn == -1) break;//break when encountering errors
393               							else l = (BYTE) rtn;
394               							//l = *str++;//set low GBK byte
395               							putGBKcode(h, l, nLine);//put unicode code via GBK2UNICODE transformation
396               						}
397               						else if ( is8digit(*(str+1)) && is8digit(*(str+2)) && is8digit(*(str+3)) ) {
398               							l = otoi(str+1);
399               							putGBKcode(h, l, nLine);//put a GBK code
400               /*							if (l == '{' || l == '}' || l == '\\' || l == '^' || l == '_' || l == '~' || l == 0x80)
401               								putGBKcode(h, l, nLine);//put a GBK code
402               							else if (bParsingErrors)
403               								fprintf (stderr, "An invalid GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
404               */							str += 4;
405               						}
406               						else if (bParsingErrors)
407               							fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
408               					}
409 zlb   1.1     					break;
410               				}
411               				if (lh == 80 && h > 0x80) {//'\80H\000L'
412               					//h = (BYTE) *str++;//set high GBK byte
413               					if (*str == '\\' && *(str+1) == '0' && *(str+2) == '0' && *(str+3) == '0') {// is '\000L' after '\80H'?
414               						str += 4;
415               						rtn = translateChar(&str,nLine);
416               						if (rtn == -1) break;//break when encountering errors
417               						else l = (BYTE) rtn;
418               						//l = *str++;//set low GBK byte
419               						putGBKcode(h, l, nLine);//put unicode code via GBK2UNICODE transformation
420               					}
421               					else if (bParsingErrors)
422               						fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
423               					break;
424               				}
425               				if (h == 80 && *str > 0) {//is '\80x' possible?
426               					if(*str != '}' && *str != '\0') str++;//goto the next code
427               				}
428               				break;
429               			}
430 zlb   1.1     			//remove '\par' from .out file
431               			if(*str == 'p' && *(str+1) == 'a' && *(str+2) == 'r') str+=3;
432               			break;
433               /*		case '^'://GBK characters with CJK format '^^xx^^yy'
434               			while(*str=='^') str++;//skip all '^' characters
435               			if (is16digit(*str) && is16digit(*(str+1))) {
436               				h = xtoi(str); str += 2;
437               				while(*str=='^') str++;//skip all '^' characters
438               				if (is16digit(*str) && is16digit(*(str+1))) {
439               					l = xtoi(str); str += 2;
440               					if (!bIgnoreCJK7) putGBKcode(h, l, nLine);//put a GBK code if not ignoring
441               				}
442               				else if (*str < 0)
443               				else if (bParsingErrors)
444               					fprintf (stderr, "An incomplete GBK character (in CJK format) is found:\n\tLine %d: ... \"^^%s\"\n", nLine, str-2);
445               			}
446               			else if (bParsingErrors)
447               				fprintf (stderr, "An incomplete GBK character (in CJK format) is found:\n\tLine %d: ... \"^^%s\"\n", nLine, str);
448               			break;
449               */		case '\"'://\CJKchar{"0xx}{"0xx} command in non-unicode mode?
450               			if (!bCJKchar) fprintf(Fout, "\\000\\%03o", *str++);//normal '"' character in non-unicode mode
451 zlb   1.1     			else {//\CJKchar{"0xx}{"0xx} command in non-unicode mode
452               				while(*str=='\"') str++;//skip all '"' characters
453               				if (*str == '0') str++;//skip the first '0'
454               				if (is16digit(*str) && is16digit(*(str+1))) {
455               					h = xtoi(str); str += 2;
456               					while(*str=='\"') str++;//skip all '"' characters
457               					if (*str == '0') str++;//skip the second '0'
458               					if (is16digit(*str) && is16digit(*(str+1))) {
459               						l = xtoi(str); str += 2;
460               						putGBKcode(h, l, nLine);//put a GBK code
461               					}
462               					else if (bParsingErrors)
463               						fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-2);
464               				}
465               				else if (bParsingErrors)
466               					fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
467               			}
468               			break;
469               		default://normal characters in non-unicode mode or cctconv GBK characters in both mode
470               			    //or '^^xx^^yy'/'^^xxL' TeX characters
471               			rtn = translateChar(&str,nLine);
472 zlb   1.1     			if (rtn == -1) break;//break when encountering errors
473               			else h = (BYTE) rtn;
474               			if (h != 0 && h != '}' && h < 0x80) fprintf(Fout, "\\000\\%03o", h);//normal latin character
475               			else {//GBK character
476               				//h = *str++;//GBK high byte
477               				if (*str == '\\') {//characters generated by cctconv 
478               					str++;
479               					for(i = 0; i < 3; i++) {
480               						if(is8digit(*str)) strCode[i] = *str++;
481               						else break;
482               					}
483               					strCode[i] = '\0';
484               					if ( i != 3) {
485               						if (bParsingErrors)
486               							fprintf (stderr, "An incomplete GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
487               						break;
488               					}
489               					l = otoi(strCode);//get low byte from '\ooo'
490               					putGBKcode(h, l, nLine);//put a GBK code
491               /*					if (l == '{' || l == '}' || l == '\\' || l == '^' || l == '_' || l == '~')
492               						putGBKcode(h, l, nLine);//put a GBK code
493 zlb   1.1     					else if (bParsingErrors)
494               						fprintf (stderr, "An invalid GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
495               */					break;
496               				}
497               				//low byte of a normal CJK character or '^^yy'
498               				rtn = translateChar(&str,nLine);
499               				if (rtn == -1) break;//break when encountering errors
500               				else l = (BYTE) rtn;
501               				switch(l) {
502               					case '}':
503               					case '\0': 
504               						if (bParsingErrors)
505               							fprintf (stderr, "An incomplete GBK character (in '^^xx^^yy' format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
506               						return str;//SHOULD exit when reading '\0' or '}'
507               					case '0': l = '\\'; break;//reserved for future CCT
508               					case '1': l = '{';  break;//reserved for future CCT
509               					case '2': l = '}';  break;//reserved for future CCT
510               					case '3': l = '^';  break;//reserved for future CCT
511               					case '4': l = '_';  break;//reserved for future CCT
512               					case '5': l = '~';  break;//reserved for future CCT
513               					case '6': l = 0x80; break;//reserved for future CCT
514 zlb   1.1     					default: ;//normal GBK character or '^^yy'
515               				}
516               				putGBKcode(h, l, nLine);//put a GBK code
517               				//str++;
518               			}
519               		}
520               	}
521               }
522               
523               int main(int argc, char* argv[])
524               {
525                 char          inname[PATH_MAX]="";
526                 char          outname[PATH_MAX]="";
527                 char          bakname[PATH_MAX]="";
528                 char          *p;
529                 unsigned int	nLength;
530                 unsigned char *b_in,*b2_in,*b3_in;
531                 BYTE			bUnicode=0;
532                 int			nLine, i;
533               
534                 for(i=1; i < argc; i++) {
535 zlb   1.1     #if defined(WIN32) || defined(GO32)
536               	  strlwr(argv[i]);
537               #endif
538               	  if (!strcmp(argv[i], "-u") || !strcmp(argv[i], "-l")) {
539               		  bLock = 1; if (bUnlockOnly) bUnlockOnly = 0;
540               	  }
541               //	  else if (!strcmp(argv[i], "-i")) bIgnoreCJK7 = 1;
542               //	  else if (!strcmp(argv[i], "-v")) bVerbose = 1;
543               	  else if (!strcmp(argv[i], "-s")) bSilent = 1;
544               	  else if (!strcmp(argv[i], "-cjk")) bCJKchar = 1;
545               	  else if (!strcmp(argv[i], "-nocjk")) bCJKchar = 0;
546               	  else if (!strcmp(argv[i], "-unlock")) {
547               		  bUnlockOnly = 1; if (bLock) bLock = 0;
548               	  }
549               	  else if (!strcmp(argv[i], "-npe")) bParsingErrors = 0;
550               	  else strcpy(inname, argv[i]);
551                 }
552               
553                 if(inname[0] == '\0') {
554                   usage();
555                   return 1;
556 zlb   1.1       }
557               
558                 if (!bSilent) version();//display version and developer information
559               
560                 p = strrchr(inname, '.');
561               #ifdef WIN32
562                 if((p == NULL) || stricmp(p, ".out")) strcat(inname, ".out");
563               #else
564                 if((p == NULL) || strcmp(p, ".out")) strcat(inname, ".out");
565               #endif
566               
567                 strcpy(outname, inname);
568                 strcat(outname, ".tmp");
569               
570                 Fin = fopen(inname, "r");
571                 if(!Fin) {
572                   fprintf(stderr, "Cannot open %s to read!\n", inname);
573                   exit(1);
574                 }
575                 nLength = _filelength (_fileno(Fin));
576                 if (nLength == -1L) {
577 zlb   1.1         fprintf(stderr, "Cannot get the file size of %s!\n", inname);
578               	fclose (Fin); exit(1);
579                 }
580               
581                 Fout = fopen(outname, "wt");
582                 if(!Fout) {
583                   fprintf(stderr, "Cannot open %s to write!\n", outname);
584                   fclose (Fin); exit(1);
585                 }
586               
587                 b_in = (unsigned char *)malloc(nLength);
588                 if(!b_in) {
589                   fprintf(stderr, "Memory allocation error!\n");
590                   fclose (Fin); fclose (Fout); exit (2);
591                 }
592               
593                 if(bLock) {
594                   fprintf(Fout,"\\let\\WriteBookmarks\\relax\n");
595                 }
596               
597                 nLine = 0;
598 zlb   1.1       while(!feof(Fin))
599                 {
600               	  if( fgets(b_in, nLength, Fin) == NULL) break;
601               	  nLine++;
602               	  //if '\let\WriteBookmarks\relax' is found, skip the current line
603               	  if (strstr(b_in,"\\let\\WriteBookmarks\\relax\n")) continue;
604               	  if (bUnlockOnly) {//only unlock .out file when '-unlock' option is set
605               		  fputs(b_in, Fout);
606               		  continue;
607               	  }
608               	  b2_in = b_in;
609                     while( (*b2_in==' ' || *b2_in=='\n' || *b2_in=='\r' || *b2_in=='\t') && *b2_in!='\0' ) b2_in++;
610               	  if ( *b2_in == '\0') {
611               //		  if (bParsingErrors)
612               //			fprintf(stderr, "Warning: No bookmark content is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
613               //		  fputs(b_in,Fout);//simply copy the wong line into new .out file
614               		  continue;
615               	  }
616               	  if (strnicmp(b2_in, "\\BOOKMARK",9)) {//skip invalid line in .out file
617               		  if (bParsingErrors)
618               			fprintf(stderr, "Invalid line is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
619 zlb   1.1     //		  fputs(b_in,Fout);//simply copy the wong line into new .out file
620               		  continue;
621               	  }
622               	  //find the position of the second parentheses
623               	  //'{' and '}' are displayed as '\173' and '\175' in .out file
624               	  //so there is no nested parentheses
625                     while(*b2_in!='}' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
626               	  while(*b2_in!='{' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
627               	  if ( *b2_in == '\0') {
628               		  if (bParsingErrors)
629               			fprintf(stderr, "Warning: No bookmark content is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
630               //		  fputs(b_in,Fout);//simply copy the wong line into new .out file
631               		  continue;
632               	  }
633               	  b2_in ++;
634               	  fwrite ( b_in, 1, b2_in-b_in, Fout);//copy the left part of the new line
635               	  //Set unicode flag and skip the leading characters if encountering '\376\377'
636               	  if( !strncmp(b2_in, "\\376\\377", 8) ) {
637               		  b2_in += 8; bUnicode = 1;
638               	  }
639               	  fputs("\\376\\377",Fout);//set .out file to unicode format in non-unicode mode
640 zlb   1.1     	  b3_in=doparse(b2_in, bUnicode, nLine);//parse the middle part and write the new unicode codes
641               	  fputs(b3_in,Fout);//copy the right part of the new line
642                 }
643               
644                 free (b_in);
645                 fclose(Fin);
646                 fclose(Fout);
647               
648                 sprintf(bakname, "%s.bak",inname);
649                 remove(bakname);
650                 rename(inname,bakname);
651                 rename(outname,inname);
652               
653                 if (!bSilent) fprintf(stdout, "gbk2uni %s is finished!\n",inname);
654               
655               //  getchar();
656                 return 0;
657               
658               }
659               

No admin address has been configured
Powered by
ViewCVS 1.0-dev