|
|
|
|
File: [cvs] / cct / cct / Attic / gbk2uni-hooklee.c
(download)
Revision: 1.1.1.1 (vendor branch), Wed Mar 26 09:52:45 2003 UTC (7 years, 5 months ago) by zlb Branch: zlb CVS Tags: v0_4_9-2, v0_4_9-1 Changes since 1.1: +0 -0 lines cct-0.4.9-1 import |
// gbk2uni.cpp : Transform GBK characters in .out file to unicode codes.
// the initial code is from out2uni in dvipdfmx project of KTUG
// authors: cxterm and Linbo Zhang in 2003
// reach them at http://www.ctex.org
// enhancer: hooklee (Shujun Li) in 2003
// reach hooklee at http://www.hooklee.com or www.chinatex.org
/////////////////////////////////////////////////////////////////////////
//********************hyperref书签文件编码规则**************************
//每个书签以如下形式存放 :\BOOKMARK [1][-]{section.0.1}{书签正文}{}
//非unicode模式下使用hyperref宏包,bookmark中的部分特殊字符以\ooo的形式插入
//' ':\040, '#':\043, '$':\044, '%':\045, '&':\046, '\':\134, '^':\136, '_':\137, '{':\173, '}':\175, '~':\176
//比较特殊的是'('和')',是以'\('和'\)'的形式插入的,而不是\ooo形式
//\S:\247
//所有其他字符和汉字均不作任何处理,在bookmark中保留
//已经知道,这种保留会造成部分汉字在bookmark中无法显示
//当使用\CJKchar{"0081}{"040}方式直接以GBK代码的方法插入汉字,bookmark中会生成如下的书签代码:
//"0081"040,显然,pdflatex忽略了\CJKchar命令本身和前后的{}把参数当做普通文本做了转换
//'^^xx^^yy'形式的CJK汉字在.out中有两种可能的出现方式:'^^xx^^yy'和'^^xxL'
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
//unicode模式下使用hyperref宏包,bookmark特殊字符均编码为\ooo\ooo或者\000x或者\000x\80y形式的unicode代码
//书签内容均以\376\377开头作为前导标示符
//经过实验,相应的bookmarkunicode代码插入规则如下:
//*****A类:编码为\ooo\ooo的特殊字符部分*****
//' '(空格):\000\040,使用\textvisiblespace也得到同样的书签
//'#'(\#):\000\043, '$'(\$):\000\044, '%'(\%):\000\045, '&'(\&):\000\046
//'(':\000\050; ')':\000\051
//'\'(\textbackslash):\000\134;
//'^'(\textasciicircum):\000\136; '_'(\_):\000\137
//'{'(\{):\000\173; '}'(\}):\000\175
//'~'(\textasciitilde):\000\176;
//*****B类:编码为\000x的普通字符部分,其中x表示字符本身*****
//abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789
//|:',./!?;"-+=[]`*@(直接用@即可,无需\@)<(或\textless)>(或\textgreater)
//*****C类:单个汉字*****
//假设其高位码为H,低位码为L,则一般的GB汉字其插入形式为:\000H\80L
//但是上述情况存在例外,当L为普通拉丁字符时,将会以\000HL的形式插入
//如果任何汉字出现在一个低位为拉丁字符的GBK汉字之后,第二个汉字会以\80H\000L的形式出现
//只有\80HL是不可能出现的汉字代码
//当书签中包含多个汉字的时候,重复按照上述规则插入,汉字中间的其他字符按照正常规则插入
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
//@注意:当汉字低位字节为字符'}{~\_^'时,tex文档编译会出现错误,强行编译可能出现不可预测的行为
//@插入out文件的内容变得很混乱,一般书签正文会在低位'}'出现之后终止,gbk2uni只尽可能地消除这种影响
//@这可能使得部分GBK汉字在书签中消失或者显示为其他字符
//@使用张林波老师随CCT新版发行的cctconv程序可以解决这个问题
//@cctconv把汉字低位字节为'\', '{', '}', '^', '_', '~'的汉字低位字节分别改为'012345'以方便处理
//@或者使用-f开关转换可以将所有高位为1的字符转换为^^xx的形式,这在一些老的不支持扩展字符的tex系统中有用
//@gbk2uni处理这样的汉字假设cctconv已经运行(cctconv与CJK兼容,无需cct.sty即可得到正确的dvi文件)
//@这样的汉字经cctconv处理后,插入out文件的对应内容在unicode模式下有两种可能:
//@sprintf("\\000%d\\%03o", H, L)和sprintf("%d\\%03o", H, L),这里L已经是被转换回来的'}{~\_^'
//现在还不清楚是否也有sprintf("\\80%d\\%03o", H, L)形式出现(根据'\80HL'不出现推测这种形式可能也不出现)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
//*****D类:\CJKchar{"00ab}{"0cd}形式的CJK汉字*****
//\000"\0000\0000\000a\000b\000"\0000\000c\000d
//显然,unicode模式的hyperref是如下工作的:
//第一步先生成非unicode模式的out文件,接着对其中的扩展字符做了一个后处理,但是这个处理对汉字不正确
//*****E类:'^^xx^^yy'形式的CJK汉字*****
//在unicode模式下,.out文件的内容一般为'\000^^xx\80^^yy'或者'\000^^xx\80L'
//估计其他CJK汉字的样式也可能出现:'\80^^xx\000^^yy'、'\80^^xx\000L'、'\000^^xx^^yy'、'\000^^xx\ooo'
//以上情况说明,在处理过程中,我们可以像TeX那样将每一个'^^xx'码字当成普通的ASCII字符来处理即可兼容'^^xx'代码
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
//如果\CJKchar{}{}中的前后两个参数位数不同,单从.out文件无法判断第二个参数何时结束
//因此,gbk2uni要求在tex文档中统一采用三位十六进制法\CJKchar{"0xx}{"0xx}表示前后两个参数
//考虑到在实际中\CJKchar用的比较少,这个约定应该不会算大的限制。
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define VERSION "0.22"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef WIN32
# include <io.h>
# define PATH_MAX _MAX_PATH+1
#else
# include <unistd.h>
# include <limits.h>
#endif
#include "gbk2uni.h"
#ifndef WIN32
# define _fileno(f) f
//Report the size of the stream f in bytes.
//The size is measured by seeking to the end of the stream; the read/write
//position that was current on entry is restored before returning.
static size_t _filelength(FILE *f)
{
    const size_t saved_offset = ftell(f);
    size_t total_bytes;

    fseek(f, 0, SEEK_END);
    total_bytes = ftell(f);
    fseek(f, saved_offset, SEEK_SET);
    return total_bytes;
}
#endif
#if !defined(WIN32) && !defined(GO32)
//Case-insensitive comparison of at most n characters of s0 and s1.
//Returns 0 when the compared prefixes are equal (ignoring case), otherwise
//the difference between the first pair of upper-cased characters that differ.
//A string that ends inside the first n characters compares as shorter, so
//strnicmp("ab", "abc", 3) is non-zero.
static int strnicmp (const char *s0, const char *s1, int n)
{
    while (n-- > 0) {
        //convert through unsigned char: toupper() is undefined for negative values,
        //and the high bytes of GBK text are negative where char is signed
        int c0 = toupper((unsigned char)*s0++);
        int c1 = toupper((unsigned char)*s1++);

        if (c0 != c1) return c0 - c1;
        if (c0 == '\0') break;//both strings ended at the same place
    }
    return 0;
}
#endif
#define BYTE unsigned char
#define DWORD unsigned int
//True when the low 8 bits of h lie in 0x81...0xfe, the range this program
//accepts as the high (lead) byte of a GBK character.
//The argument is evaluated twice, so it must not have side effects.
#define GBK_HIGH(h) ((0x81<=((h)&0xff)) && (((h)&0xff)<=0xfe))
//True when the low 8 bits of l lie in 0x40...0xfe, the range this program
//accepts as the low (trail) byte of a GBK character.
//The argument is evaluated twice, so it must not have side effects.
#define GBK_LOW(l) ((0x40<=((l)&0xff)) && (((l)&0xff)<=0xfe))
int bLock = 0;//set by '-u' or '-l': main() then writes a '\let\WriteBookmarks\relax' line at the top of the new .out file
int bUnlockOnly = 0;//set by '-unlock': main() copies the lines through without parsing them
int bCJKchar = 1;//\CJKchar parsing is enabled by default; '-cjk' sets this flag and '-nocjk' clears it
//int bIgnoreCJK7 = 0;
int bParsingErrors = 1;//cleared by '-npe': the parsing diagnostics written to stderr are suppressed
//int bVerbose = 0;
int bSilent = 0;//set by '-s': the version banner and the final status line are not printed
FILE *Fout;//output stream; main() opens it on the temporary '<name>.tmp' file
FILE *Fin;//input stream; main() opens it on the .out file being converted
void version(void)
{
printf("gbk2uni, version "VERSION", initially implemented by cxterm and ZLB in Jan. 2003\n");
printf("\t enhanced by hooklee in Mar. 2003.\n");
printf("\t please visit www.ctex.org and www.chinatex.org for more information.\n");
}
//Print the version banner followed by the command-line synopsis and the option list.
//The disabled '-i' option is intentionally not listed.
void usage(void)
{
    static const char *const help_lines[] = {
        "Usage : gbk2uni [options] filename[.out] [options]\n",
        "Options:\n",
        "\t-u(-l)\t lock .out file to avoid overwritten in the next (pdf)latex run\n",
        "\t\t (.out file will be unlocked if no '-u' and '-l' options)\n",
        "\t-unlock\t unlock .out file without parsing .out file\n",
        "\t-s\t run gbk2uni silently (but errors remain)\n",
        "\t-cjk\t parse \\CJKchar{\"0xx}{\"0xx} command (default)\n",
        "\t-nocjk\t disable parsing \\CJKchar{\"0xx}{\"0xx} command\n",
        "\t-npe\t disable display of all parsing errors\n"
    };
    size_t k;

    version();
    for (k = 0; k < sizeof help_lines / sizeof help_lines[0]; k++)
        fputs(help_lines[k], stdout);
}
/*
//write unicode into the file Fout
void putucode(unsigned int u)
{
unsigned int h, l;
l = u & 0xff;
h = (u >> 8) & 0xff;
fprintf(Fout,"%c%03o%c%03o",'\\',h,'\\',l);
}*/
//Write the Unicode equivalent of one GBK character to Fout.
//  h, l  : high and low byte of the GBK character
//  nLine : line number of the .out file, used only in the diagnostic
//The code unit is looked up in the gbk2uni table (row stride 192, first row
//for high byte 0x81, first column for low byte 0x40) and written as two octal
//escapes '\ooo\ooo', high byte first.
//A pair rejected by GBK_HIGH/GBK_LOW is discarded; unless parsing errors are
//disabled, a diagnostic showing both bytes as two hex digits each goes to stderr.
void putGBKcode(BYTE h, BYTE l,int nLine)
{
    unsigned int u;

    if (!GBK_HIGH(h) || !GBK_LOW(l)) {
        //zero-padded so that e.g. 0x81 0x05 is reported as 0x8105, not 0x815
        if (bParsingErrors)
            fprintf (stderr, "An invalid GBK character is found:\n\tLine %d: ... 0x%02x%02x\n", nLine, (unsigned int)h, (unsigned int)l);
        return;
    }
    u = gbk2uni[(h-0x81)*192 + (l-0x40)];
    fprintf(Fout,"\\%03o\\%03o",(u >> 8) & 0xff,u & 0xff);
}
//Return 1 when c is an octal digit '0'...'7', otherwise 0.
int is8digit(char c)
{
    if (c < '0' || c > '7')
        return 0;
    return 1;
}
//Return 1 when c is a hexadecimal digit ('0'...'9', 'a'...'f', 'A'...'F'), otherwise 0.
//Plain range comparisons are used instead of isdigit(): c may be the high byte
//of a GBK character, which is negative where char is signed, and passing a
//negative value to a <ctype.h> function is undefined behaviour.
int is16digit(char c)
{
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'f') return 1;
    if (c >= 'A' && c <= 'F') return 1;
    return 0;
}
//Convert the three octal digits str[0], str[1], str[2] to the byte they encode.
//The characters are not validated, and a value above 0377 is reduced by the
//conversion to unsigned char.
unsigned char otoi(char *str)
{
    int value = 0;
    int k;

    for (k = 0; k < 3; k++)
        value = value * 8 + (str[k] - '0');
    return value;
}
//Convert the two hexadecimal digits str[0], str[1] (either case) to the byte they encode.
//The characters are not validated; callers are expected to pass hex digits.
unsigned char xtoi(char *str)
{
    unsigned char nibble[2];
    int k;

    for (k = 0; k < 2; k++) {
        //go through unsigned char: <ctype.h> functions are undefined for negative values
        unsigned char c = (unsigned char)tolower((unsigned char)str[k]);

        if (isdigit(c))
            nibble[k] = (unsigned char)(c - '0');
        else
            nibble[k] = (unsigned char)(c - 'a' + 10);
    }
    return (unsigned char)(16*nibble[0] + nibble[1]);
}
//Helper for the unicode-mode form of \CJKchar, in which every character of
//the command text is preceded by the escape '\000'.
//Advances *str to the next backslash and then over the three '0' characters
//expected after it.
//Returns  1 : *str now points just behind a complete '\000'
//         0 : a '}' or the end of the string was reached first (*str points at it)
//        -1 : the backslash is not followed by three '0' characters; *str points
//             at the offending character and, unless parsing errors are
//             disabled, a diagnostic has been written to stderr
int getCJKchar(char **str,int nLine)
{
    char *cur = *str;
    int zeros = 0;

    //look for the next backslash, giving up at the end of the bookmark text
    for (; *cur != '\\'; cur++) {
        if (*cur == '\0' || *cur == '}') {
            *str = cur;
            return 0;
        }
    }
    cur++;//step over the backslash
    while (zeros < 3 && *cur == '0') {
        cur++;
        zeros++;
    }
    *str = cur;
    if (zeros == 3)
        return 1;
    if (bParsingErrors)
        fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, cur-zeros-1);
    return -1;
}
//Read one character from *str, decoding the TeX notation '^^xx' (xx = two hex digits).
//- If *str does not start with '^', the character itself is returned as an
//  unsigned byte value and *str is advanced by one, except at '}' or at the
//  end of the string, where *str is left unchanged.
//- Otherwise every leading '^' is skipped; if two hex digits follow, their
//  value is returned and *str is advanced behind them.
//- If the hex digits are missing, -1 is returned, *str stays behind the '^'
//  characters and, unless parsing errors are disabled, a diagnostic is written to stderr.
int translateChar(char **str,int nLine)
{
    char *cur = *str;
    BYTE value;

    if (*cur == '^') {
        do {
            cur++;
        } while (*cur == '^');
        *str = cur;
        if (!is16digit(cur[0]) || !is16digit(cur[1])) {
            if (bParsingErrors)
                fprintf (stderr, "An incomplete '^^xx' TeX character is found:\n\tLine %d: ... \"%s\"\n", nLine, cur-2);
            return -1;
        }
        value = xtoi(cur);
        *str = cur + 2;
        return value;
    }

    //ordinary character: hand it back unchanged
    value = *cur;
    if (value != '}' && value != '\0')
        *str = cur + 1;
    return value;
}
//Read one byte of a unicode-mode \CJKchar argument ("0xx with xx = two hex digits).
//Three '\000' escapes are consumed with getCJKchar(): the one in front of the
//leading '0' and the ones in front of each of the two hex digits.
//Returns 1 with the byte stored in *value, or the 0 / -1 result of the
//getCJKchar() call that failed (see getCJKchar for their meaning).
static int readCJKhexByte(char **pstr, int nLine, BYTE *value)
{
    char digits[3];
    int k, rtn;

    rtn = getCJKchar (pstr, nLine);//escape in front of the leading '0'
    if (rtn != 1) return rtn;
    for (k = 0; k < 2; k++) {
        rtn = getCJKchar (pstr, nLine);//escape in front of the next hex digit
        if (rtn != 1) return rtn;
        digits[k] = *(*pstr)++;
    }
    digits[2] = '\0';
    *value = xtoi (digits);
    return 1;
}

//Parse the text of one bookmark and write the equivalent unicode escapes to Fout.
//  str      : first character of the bookmark text (behind '\376\377' if present)
//  bUnicode : non-zero when the line was written by hyperref in unicode mode
//  nLine    : line number of the .out file, used only in diagnostics
//Returns a pointer to the character at which parsing stopped: the closing '}'
//of the bookmark text or the terminating '\0'. The caller copies everything
//from there on to the output unchanged.
//Malformed input is reported on stderr (unless parsing errors are disabled)
//and skipped.
char *doparse(char *str,BYTE bUnicode,int nLine)
{
    BYTE lh,h,l;//h and l are the high and low byte of a GBK character, lh is the value of the leading '\ooo'
    int i, rtn;
    char strCode[4];

    while(1) {
        //skip white space; a blank is significant in non-unicode mode and is kept there
        if(bUnicode)
            while(*str==' ' || *str=='\t' || *str=='\n' || *str=='\r') str++;
        else
            while(*str=='\t' || *str=='\n' || *str=='\r') str++;
        if ( *str == '}' || *str == '\0') return str;//end of the bookmark text
        switch(*str) {
        case '\\':
            //escapes written by pdflatex, including '\(' and '\)'
            //note: '\oo' and '\par' may occur in a wrongly compiled tex document
            while(*str == '\\') str++;//occasionally a doubled backslash occurs in a wrong .out file
            if(!bUnicode) {
                //'\(' and '\)' in non-unicode mode
                if ( *str == '(' || *str == ')') {
                    fprintf(Fout,"\\000\\%03o", *str++);
                    break;
                }
                //'\ooo' in non-unicode mode stands for a special latin character such as '\S'
                if (isdigit((unsigned char)*str)) {
                    fprintf(Fout,"\\000\\");//prefix that turns the escape into a 16-bit code
                    i=0;
                    while(1) {//copy at most three digits
                        fputc(*str++,Fout);i++;
                        if(!isdigit((unsigned char)*str) || i >=3) break;
                    }
                    if ( i < 3 && bParsingErrors)
                        fprintf (stderr, "An incomplete special character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
                }
                break;
            }
            //unicode mode: '\ooo\ooo', '\000x', '\000H\80L', '\000HL' or '\80H\000L',
            //where H and L may each be written in '^^xx' notation
            if (isdigit((unsigned char)*str)) {
                //collect up to three digits; fewer than three ('\80') is a legal form here
                strCode[0]=*str++;
                for (i=1; i<3; i++) {
                    if(isdigit((unsigned char)*str)) strCode[i]=*str++;
                    else break;
                }
                strCode[i]='\0';
                if (i == 1) {
                    if (bParsingErrors)
                        fprintf (stderr, "An incomplete special unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
                    break;
                }
                //the digits are read as a decimal number; only the values 0 ('\000') and 80 ('\80') are tested below
                //NOTE(review): lh is a BYTE, so a value above 255 such as '\256' wraps around - confirm that this is harmless
                lh = atoi(strCode);
                if(i ==3 && *str == '\\') {//'\ooo\ooo': a complete unicode code, copied through
                    fprintf(Fout, "\\%s\\", strCode);//first '\ooo' and the backslash of the second one
                    str++;
                    for (i=0; i<3; i++) {
                        if(isdigit((unsigned char)*str)) strCode[i]=*str++;
                        else break;
                    }
                    strCode[i]='\0';
                    if (i < 3) {
                        if (bParsingErrors)
                            fprintf (stderr, "An incomplete unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
                    }
                    else fprintf(Fout, "%s", strCode);//digits of the second 'ooo'
                    break;
                }
                if (lh == 0 && *str == '\"' && bCJKchar) {//\CJKchar{"0xx}{"0xx} command in unicode mode
                    rtn = readCJKhexByte (&str, nLine, &h);//high byte
                    if(rtn == 0) return str;
                    if(rtn == -1) break;
                    rtn = getCJKchar (&str, nLine);//escape in front of the second '"'
                    if(rtn == 0) return str;
                    if(rtn == -1) break;
                    if (*str != '\"') {
                        if (bParsingErrors)
                            fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, str-4);
                        break;
                    }
                    rtn = readCJKhexByte (&str, nLine, &l);//low byte
                    if(rtn == 0) return str;
                    if(rtn == -1) break;
                    putGBKcode(h, l, nLine);
                    break;
                }
                //'\000x' or '\000H\80L' or '\80H\000L' or '\000H\ooo'
                rtn = translateChar(&str,nLine);
                if (rtn == -1) break;
                else h = (BYTE) rtn;
                if (lh == 0 && h != 0 && h != '}' && h < 0x80) {//'\000x'
                    //written as '\000\ooo', which is more robust than the raw character
                    fprintf(Fout, "\\000\\%03o", h);
                    break;
                }
                if (lh == 0 && h > 0x80) {//'\000HL' or '\000H\80L' or '\000H\ooo'
                    if ( *str != '\\') {//'\000HL'
                        rtn = translateChar(&str,nLine);
                        if (rtn == -1) break;
                        else l = (BYTE) rtn;
                        putGBKcode(h, l, nLine);
                    }
                    else {//'\000H\80L' or '\000H\ooo'
                        if (*(str+1) == '8' && *(str+2) == '0') {//'\80L'
                            str += 3;
                            rtn = translateChar(&str,nLine);
                            if (rtn == -1) break;
                            else l = (BYTE) rtn;
                            putGBKcode(h, l, nLine);
                        }
                        else if ( is8digit(*(str+1)) && is8digit(*(str+2)) && is8digit(*(str+3)) ) {//'\ooo' holds the low byte
                            l = otoi(str+1);
                            putGBKcode(h, l, nLine);
                            str += 4;
                        }
                        else if (bParsingErrors)
                            fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
                    }
                    break;
                }
                if (lh == 80 && h > 0x80) {//'\80H\000L'
                    if (*str == '\\' && *(str+1) == '0' && *(str+2) == '0' && *(str+3) == '0') {
                        str += 4;
                        rtn = translateChar(&str,nLine);
                        if (rtn == -1) break;
                        else l = (BYTE) rtn;
                        putGBKcode(h, l, nLine);
                    }
                    else if (bParsingErrors)
                        fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
                    break;
                }
                //NOTE(review): the original remark here asks whether '\80x' is possible, which
                //suggests that 'lh == 80' was meant; as written the test compares the
                //character h with 80 ('P'). Left unchanged - confirm the intent.
                if (h == 80 && *str > 0) {
                    if(*str != '}' && *str != '\0') str++;//go to the next code
                }
                break;
            }
            //drop '\par' from the .out file
            if(*str == 'p' && *(str+1) == 'a' && *(str+2) == 'r') str+=3;
            break;
        case '\"'://possibly a \CJKchar{"0xx}{"0xx} command in non-unicode mode
            if (!bCJKchar) fprintf(Fout, "\\000\\%03o", *str++);//an ordinary '"' character
            else {
                while(*str=='\"') str++;
                if (*str == '0') str++;//leading '0' of the first argument
                if (is16digit(*str) && is16digit(*(str+1))) {
                    h = xtoi(str); str += 2;
                    while(*str=='\"') str++;
                    if (*str == '0') str++;//leading '0' of the second argument
                    if (is16digit(*str) && is16digit(*(str+1))) {
                        l = xtoi(str); str += 2;
                        putGBKcode(h, l, nLine);
                    }
                    else if (bParsingErrors)
                        fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-2);
                }
                else if (bParsingErrors)
                    fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
            }
            break;
        default://ordinary characters, raw GBK characters (also as written by cctconv) and '^^xx' notation
            rtn = translateChar(&str,nLine);
            if (rtn == -1) break;
            else h = (BYTE) rtn;
            if (h != 0 && h != '}' && h < 0x80) fprintf(Fout, "\\000\\%03o", h);//ordinary latin character
            else {//h is the high byte of a GBK character
                if (*str == '\\') {//low byte written as '\ooo'
                    str++;
                    for(i = 0; i < 3; i++) {
                        if(is8digit(*str)) strCode[i] = *str++;
                        else break;
                    }
                    strCode[i] = '\0';
                    if ( i != 3) {
                        if (bParsingErrors)
                            fprintf (stderr, "An incomplete GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
                        break;
                    }
                    l = otoi(strCode);
                    putGBKcode(h, l, nLine);
                    break;
                }
                //low byte given directly or in '^^yy' notation
                rtn = translateChar(&str,nLine);
                if (rtn == -1) break;
                else l = (BYTE) rtn;
                switch(l) {
                case '}':
                case '\0':
                    if (bParsingErrors)
                        fprintf (stderr, "An incomplete GBK character (in '^^xx^^yy' format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
                    return str;//the bookmark text ended in the middle of a character
                //the digits '0'...'6' stand in for low bytes that TeX cannot carry (reserved for future CCT)
                case '0': l = '\\'; break;
                case '1': l = '{'; break;
                case '2': l = '}'; break;
                case '3': l = '^'; break;
                case '4': l = '_'; break;
                case '5': l = '~'; break;
                case '6': l = 0x80; break;
                default: ;//any other low byte is used as it is
                }
                putGBKcode(h, l, nLine);
            }
        }
    }
}
//Program entry: convert the bookmarks of a hyperref .out file from GBK to unicode escapes.
//Command line: options (see usage()) and one file name, to which ".out" is
//appended unless it already ends in ".out"; if several names are given the last one is used.
//The converted text is written to "<name>.tmp"; on success the original file
//is renamed to "<name>.bak" and the new file takes its place.
//Returns 0 on success and 1 when no file name was given; other failures
//terminate the program through exit() with a non-zero status.
int main(int argc, char* argv[])
{
    char inname[PATH_MAX]="";
    char outname[PATH_MAX]="";
    char bakname[PATH_MAX]="";
    char *p;
    long nLength;
    char *b_in,*b2_in,*b3_in;
    BYTE bUnicode=0;
    int nLine, i;

    for(i=1; i < argc; i++) {
#if defined(WIN32) || defined(GO32)
        strlwr(argv[i]);
#endif
        if (!strcmp(argv[i], "-u") || !strcmp(argv[i], "-l")) {
            bLock = 1; if (bUnlockOnly) bUnlockOnly = 0;
        }
        else if (!strcmp(argv[i], "-s")) bSilent = 1;
        else if (!strcmp(argv[i], "-cjk")) bCJKchar = 1;
        else if (!strcmp(argv[i], "-nocjk")) bCJKchar = 0;
        else if (!strcmp(argv[i], "-unlock")) {
            bUnlockOnly = 1; if (bLock) bLock = 0;
        }
        else if (!strcmp(argv[i], "-npe")) bParsingErrors = 0;
        else {
            //anything else is the file name; reject it if ".out" plus ".tmp"/".bak"
            //and the terminator (9 bytes in all) would not fit into the name buffers
            if (strlen(argv[i]) + 9 > sizeof(inname)) {
                fprintf(stderr, "File name is too long: %s\n", argv[i]);
                exit(1);
            }
            strcpy(inname, argv[i]);
        }
    }
    if(inname[0] == '\0') {
        usage();
        return 1;
    }
    if (!bSilent) version();//display version and developer information

    //derive the three file names; their length was checked above
    p = strrchr(inname, '.');
#ifdef WIN32
    if((p == NULL) || stricmp(p, ".out")) strcat(inname, ".out");
#else
    if((p == NULL) || strcmp(p, ".out")) strcat(inname, ".out");
#endif
    strcpy(outname, inname);
    strcat(outname, ".tmp");
    sprintf(bakname, "%s.bak",inname);

    Fin = fopen(inname, "r");
    if(!Fin) {
        fprintf(stderr, "Cannot open %s to read!\n", inname);
        exit(1);
    }
    //a signed type is used so that the failure value -1 is recognised on every platform
    nLength = (long)_filelength (_fileno(Fin));
    if (nLength < 0) {
        fprintf(stderr, "Cannot get the file size of %s!\n", inname);
        fclose (Fin); exit(1);
    }
    //the size is handed to fgets() as an int below (int is assumed to hold at least 32 bits)
    if (nLength >= 0x7fffffffL) {
        fprintf(stderr, "%s is too large!\n", inname);
        fclose (Fin); exit(1);
    }
    Fout = fopen(outname, "wt");
    if(!Fout) {
        fprintf(stderr, "Cannot open %s to write!\n", outname);
        fclose (Fin); exit(1);
    }
    //no line can be longer than the file; one extra byte for the terminator added by fgets()
    b_in = (char *)malloc((size_t)nLength + 1);
    if(!b_in) {
        fprintf(stderr, "Memory allocation error!\n");
        fclose (Fin); fclose (Fout); exit (2);
    }
    if(bLock) {
        fprintf(Fout,"\\let\\WriteBookmarks\\relax\n");
    }
    nLine = 0;
    while (fgets(b_in, (int)nLength + 1, Fin) != NULL) {
        nLine++;
        //an existing lock line is always dropped (a new one was written above if requested)
        if (strstr(b_in,"\\let\\WriteBookmarks\\relax\n")) continue;
        if (bUnlockOnly) {//'-unlock': copy the line without parsing it
            fputs(b_in, Fout);
            continue;
        }
        //skip leading white space; blank lines are dropped silently
        b2_in = b_in;
        while (*b2_in==' ' || *b2_in=='\n' || *b2_in=='\r' || *b2_in=='\t') b2_in++;
        if ( *b2_in == '\0') continue;
        if (strnicmp(b2_in, "\\BOOKMARK",9)) {//lines that are not bookmarks are dropped
            if (bParsingErrors)
                fprintf(stderr, "Invalid line is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
            continue;
        }
        //go to the first '{' behind the first '}': the start of the bookmark text
        //('{' and '}' inside the text appear as '\173' and '\175', so braces are never nested)
        while(*b2_in!='}' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
        while(*b2_in!='{' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
        if ( *b2_in == '\0') {
            if (bParsingErrors)
                fprintf(stderr, "Warning: No bookmark content is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
            continue;
        }
        b2_in ++;
        fwrite ( b_in, 1, (size_t)(b2_in-b_in), Fout);//copy the part in front of the bookmark text
        //'\376\377' marks a line written in unicode mode
        //NOTE(review): bUnicode is never cleared again, so every later line is also
        //parsed in unicode mode - confirm that mixed files cannot occur
        if( !strncmp(b2_in, "\\376\\377", 8) ) {
            b2_in += 8; bUnicode = 1;
        }
        fputs("\\376\\377",Fout);//the output is always in unicode format
        b3_in=doparse(b2_in, bUnicode, nLine);//convert the bookmark text
        fputs(b3_in,Fout);//copy the part behind the bookmark text
    }
    free (b_in);
    fclose(Fin);
    //a failed flush must not replace the original file by a truncated one
    if (fclose(Fout) != 0) {
        fprintf(stderr, "Cannot write %s!\n", outname);
        exit(1);
    }
    remove(bakname);//may fail simply because there is no previous backup
    if (rename(inname,bakname) != 0)
        fprintf(stderr, "Warning: cannot rename %s to %s!\n", inname, bakname);
    if (rename(outname,inname) != 0) {
        fprintf(stderr, "Cannot rename %s to %s!\n", outname, inname);
        exit(1);
    }
    if (!bSilent) fprintf(stdout, "gbk2uni %s is finished!\n",inname);
    return 0;
}
| No admin address has been configured |
Powered by ViewCVS 1.0-dev |