From: Jean-Marc Lasgouttes Date: Mon, 6 Mar 2017 16:08:38 +0000 (+0100) Subject: Update bundled mythes to version 1.2.5 X-Git-Tag: 2.3.0alpha1~273 X-Git-Url: https://git.lyx.org/gitweb/?a=commitdiff_plain;h=0e50ad8b164724b8e05005c6c5e21d51cd8c8421;p=lyx.git Update bundled mythes to version 1.2.5 Move it to 3rdparty/ directory alongside the other ones. --- diff --git a/3rdparty/Makefile.am b/3rdparty/Makefile.am index 02dbfef70b..c6cc780644 100644 --- a/3rdparty/Makefile.am +++ b/3rdparty/Makefile.am @@ -1,6 +1,6 @@ include $(top_srcdir)/config/common.am -DIST_SUBDIRS = boost hunspell libiconv zlib +DIST_SUBDIRS = boost hunspell mythes libiconv zlib if USE_INCLUDED_BOOST if LYX_USE_STD_REGEX @@ -13,6 +13,10 @@ if USE_INCLUDED_HUNSPELL HUNSPELL = hunspell endif +if USE_INCLUDED_MYTHES +MYTHES = mythes +endif + if USE_INCLUDED_ICONV ICONV = libiconv endif @@ -21,4 +25,4 @@ if USE_INCLUDED_ZLIB ZLIB = zlib endif -SUBDIRS = $(BOOST) $(HUNSPELL) $(ICONV) $(ZLIB) +SUBDIRS = $(BOOST) $(HUNSPELL) $(MYTHES) $(ICONV) $(ZLIB) diff --git a/3rdparty/mythes/1.2.5/AUTHORS b/3rdparty/mythes/1.2.5/AUTHORS new file mode 100644 index 0000000000..274fd9a40f --- /dev/null +++ b/3rdparty/mythes/1.2.5/AUTHORS @@ -0,0 +1,3 @@ +Kevin Hendricks +Németh László +Caolán McNamara diff --git a/3rdparty/mythes/1.2.5/COPYING b/3rdparty/mythes/1.2.5/COPYING new file mode 100644 index 0000000000..b6bf70a0c7 --- /dev/null +++ b/3rdparty/mythes/1.2.5/COPYING @@ -0,0 +1,34 @@ +/* + * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada + * And Contributors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All modifications to the source code must be clearly marked as + * such. Binary redistributions based on modified source code + * must be clearly marked as modified versions in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ diff --git a/3rdparty/mythes/1.2.5/README b/3rdparty/mythes/1.2.5/README new file mode 100644 index 0000000000..35ee09441c --- /dev/null +++ b/3rdparty/mythes/1.2.5/README @@ -0,0 +1,63 @@ +MyThes is a simple thesaurus that uses a structured +text data file and an index file with binary search +to lookup words and phrases and return information +on part of speech, meanings, and synonyms + +MyThes was originally written to provide a thesaurus +for the OpenOffice.org project + +The Main features of MyThes are: + +1. written in C++ to make it easier to interface with + LibreOffice, OpenOffice, AbiWord, Pspell, etc + +2. 
it is stateless, uses no static variables and + should be completely reentrant with no ifdefs + +3. it compiles with -ansi and -pedantic and -Wall + with no warnings so it should be quite portable + +4. it uses a simple perl program to read the structured + text file and create the index needed for binary + searching + +5. it is very simple with *lots* of comments. + The main "smarts" are in the structure of the + text file that makes up the thesaurus data + +6. It comes with a ready-to-go structured thesaurus + data file for en_US extracted from the WordNet-2.0 data. + + Please see WordNet_license.txt and WordNet_readme.txt + for more information on the very useful project! + + See http://www.danielnaber.de/wn2ooo/ for utilities to + regenerate an up to date English thesaurus from the most + recent WordNet data. + +7. The source code has a BSD license (and no advertising clause) + + +MyThes comes with a simple example program that looks up some words and returns +meanings and synonyms. + +To build it simply do the following: + +unzip mythes.zip +cd mythes +./configure +make + +To run the example program: +./example th_en_US_new.idx th_en_US_new.dat checkme.lst + +To run the example program with stemming and morphological generation: +e.g. to check mouse, mice, rodents, eats, eaten, ate, eating etc. words +./example morph.idx morph.dat morph.lst morph.aff morph.dic + +NOTE: this is only an example and test environment for dictionary developers, +full English stemming and morphological generation needs an improved +English Hunspell dictionary. 
+ +László Németh +Kevin Hendricks diff --git a/3rdparty/mythes/1.2.5/mythes.cxx b/3rdparty/mythes/1.2.5/mythes.cxx new file mode 100644 index 0000000000..675bbfe3cc --- /dev/null +++ b/3rdparty/mythes/1.2.5/mythes.cxx @@ -0,0 +1,375 @@ +#include "COPYING" +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <limits> +#include <vector> + +#include "mythes.hxx" + +MyThes::MyThes(const char* idxpath, const char * datpath) +{ + nw = 0; + encoding = NULL; + list = NULL; + offst = NULL; + pdfile = NULL; + + if (thInitialize(idxpath, datpath) != 1) { + fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath); + fflush(stderr); + thCleanup(); + // did not initialize properly - throw exception? + } +} + + +MyThes::~MyThes() +{ + thCleanup(); +} + + +int MyThes::thInitialize(const char* idxpath, const char* datpath) +{ + + // open the index file + FILE * pifile = fopen(idxpath,"r"); + if (!pifile) { + return 0; + } + + // parse in encoding and index size */ + std::vector<char> buffer(MAX_WD_LEN); + char * wrd = &buffer[0]; + readLine(pifile,wrd,MAX_WD_LEN); + encoding = mystrdup(wrd); + readLine(pifile,wrd,MAX_WD_LEN); + int idxsz = atoi(wrd); + + if (idxsz <= 0 || idxsz > std::numeric_limits<int>::max() / sizeof(sizeof(char*))) { + fprintf(stderr,"Error - bad index %d\n", idxsz); + fclose(pifile); + return 0; + } + + // now allocate list, offst for the given size + list = (char**) calloc(idxsz,sizeof(char*)); + offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int)); + + if ( (!(list)) || (!(offst)) ) { + fprintf(stderr,"Error - bad memory allocation\n"); + fclose(pifile); + return 0; + } + + // now parse the remaining lines of the index + int len = readLine(pifile,wrd,MAX_WD_LEN); + while (len > 0) + { + int np = mystr_indexOfChar(wrd,'|'); + if (nw < idxsz) { + if (np >= 0) { + *(wrd+np) = '\0'; + list[nw] = (char *)calloc(1,(np+1)); + if (!list[nw]) { + fprintf(stderr,"Error - bad memory allocation\n"); + fflush(stderr); + fclose(pifile); + return 0; + } + memcpy((list[nw]),wrd,np); + 
offst[nw] = atoi(wrd+np+1); + nw++; + } + } + len = readLine(pifile,wrd,MAX_WD_LEN); + } + + fclose(pifile); + + /* next open the data file */ + pdfile = fopen(datpath,"r"); + if (!pdfile) { + return 0; + } + + return 1; +} + + +void MyThes::thCleanup() +{ + /* first close the data file */ + if (pdfile) { + fclose(pdfile); + pdfile=NULL; + } + + if (list) + { + /* now free up all the allocated strings on the list */ + for (int i=0; i < nw; i++) + { + if (list[i]) { + free(list[i]); + list[i] = 0; + } + } + free((void*)list); + } + + if (encoding) free((void*)encoding); + if (offst) free((void*)offst); + + encoding = NULL; + list = NULL; + offst = NULL; + nw = 0; +} + + + +// lookup text in index and count of meanings and a list of meaning entries +// with each entry having a synonym count and pointer to an +// array of char * (i.e the synonyms) +// +// note: calling routine should call CleanUpAfterLookup with the original +// meaning point and count to properly deallocate memory + +int MyThes::Lookup(const char * pText, int len, mentry** pme) +{ + + *pme = NULL; + + // handle the case of missing file or file related errors + if (! pdfile) return 0; + + long offset = 0; + + /* copy search word and make sure null terminated */ + std::vector<char> buffer(len+1); + char * wrd = &buffer[0]; + memcpy(wrd,pText,len); + + /* find it in the list */ + int idx = nw > 0 ? 
binsearch(wrd,list,nw) : -1; + if (idx < 0) return 0; + + // now seek to the offset + offset = (long) offst[idx]; + int rc = fseek(pdfile,offset,SEEK_SET); + if (rc) { + return 0; + } + + // grab the count of the number of meanings + // and allocate a list of meaning entries + char * buf = NULL; + buf = (char *) malloc( MAX_LN_LEN ); + if (!buf) return 0; + readLine(pdfile, buf, (MAX_LN_LEN-1)); + int np = mystr_indexOfChar(buf,'|'); + if (np < 0) { + free(buf); + return 0; + } + int nmeanings = atoi(buf+np+1); + if (nmeanings < 0 || nmeanings > std::numeric_limits<int>::max() / sizeof(mentry)) + nmeanings = 0; + *pme = (mentry*)(nmeanings ? malloc(nmeanings * sizeof(mentry)) : NULL); + if (!(*pme)) { + free(buf); + return 0; + } + + // now read in each meaning and parse it to get defn, count and synonym lists + mentry* pm = *(pme); + char dfn[MAX_WD_LEN]; + + for (int j = 0; j < nmeanings; j++) { + readLine(pdfile, buf, (MAX_LN_LEN-1)); + + pm->count = 0; + pm->psyns = NULL; + pm->defn = NULL; + + // store away the part of speech for later use + char * p = buf; + char * pos = NULL; + np = mystr_indexOfChar(p,'|'); + if (np >= 0) { + *(buf+np) = '\0'; + pos = mystrdup(p); + p = p + np + 1; + } else { + pos = mystrdup(""); + } + + // count the number of fields in the remaining line + int nf = 1; + char * d = p; + np = mystr_indexOfChar(d,'|'); + while ( np >= 0 ) { + nf++; + d = d + np + 1; + np = mystr_indexOfChar(d,'|'); + } + pm->count = nf; + pm->psyns = (char **) malloc(nf*sizeof(char*)); + + // fill in the synonym list + d = p; + for (int jj = 0; jj < nf; jj++) + { + np = mystr_indexOfChar(d,'|'); + if (np > 0) + { + *(d+np) = '\0'; + pm->psyns[jj] = mystrdup(d); + d = d + np + 1; + } + else + { + pm->psyns[jj] = mystrdup(d); + } + } + + // add pos to first synonym to create the definition + if (pm->psyns[0]) + { + int k = strlen(pos); + int m = strlen(pm->psyns[0]); + if ((k+m) < (MAX_WD_LEN - 1)) { + strncpy(dfn,pos,k); + *(dfn+k) = ' '; + 
strncpy((dfn+k+1),(pm->psyns[0]),m+1); + pm->defn = mystrdup(dfn); + } else { + pm->defn = mystrdup(pm->psyns[0]); + } + } + free(pos); + pm++; + + } + free(buf); + + return nmeanings; +} + + + +void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings) +{ + + if (nmeanings == 0) return; + if ((*pme) == NULL) return; + + mentry * pm = *pme; + + for (int i = 0; i < nmeanings; i++) { + int count = pm->count; + for (int j = 0; j < count; j++) { + if (pm->psyns[j]) free(pm->psyns[j]); + pm->psyns[j] = NULL; + } + if (pm->psyns) free(pm->psyns); + pm->psyns = NULL; + if (pm->defn) free(pm->defn); + pm->defn = NULL; + pm->count = 0; + pm++; + } + pm = *pme; + free(pm); + *pme = NULL; + return; +} + + +// read a line of text from a text file stripping +// off the line terminator and replacing it with +// a null string terminator. +// returns: -1 on error or the number of characters +// in the returned string + +// At most nc-1 characters are stored (fgets reserves one byte for the terminator) + +int MyThes::readLine(FILE * pf, char * buf, int nc) +{ + + if (fgets(buf,nc,pf)) { + mychomp(buf); + return strlen(buf); + } + return -1; +} + + + +// performs a binary search on null terminated character +// strings; nlst must be > 0 (first and last entries are read unconditionally) +// +// returns: -1 on not found +// index of sw in _list[] otherwise + +int MyThes::binsearch(char * sw, char* _list[], int nlst) +{ + int lp, up, mp, j, indx; + lp = 0; + up = nlst-1; + indx = -1; + if (strcmp(sw,_list[lp]) < 0) return -1; + if (strcmp(sw,_list[up]) > 0) return -1; + while (indx < 0 ) { + mp = (int)((lp+up) >> 1); + j = strcmp(sw,_list[mp]); + if ( j > 0) { + lp = mp + 1; + } else if (j < 0 ) { + up = mp - 1; + } else { + indx = mp; + } + if (lp > up) return -1; + } + return indx; +} + +char * MyThes::get_th_encoding() +{ + return encoding; +} + + +// string duplication routine +char * MyThes::mystrdup(const char * s) +{ + char * d = NULL; + if (s) { + int sl = strlen(s)+1; + d = (char *) malloc(sl); + if (d) memcpy(d,s,sl); + } + return d; +} + +// remove cross-platform text line 
end characters +void MyThes::mychomp(char * s) +{ + int k = strlen(s); + if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; + if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; +} + + +// return index of first occurrence of char in string, or -1 if absent +int MyThes::mystr_indexOfChar(const char * d, int c) +{ + char * p = strchr((char *)d,c); + if (p) return (int)(p-d); + return -1; +} + diff --git a/3rdparty/mythes/1.2.5/mythes.hxx b/3rdparty/mythes/1.2.5/mythes.hxx new file mode 100644 index 0000000000..eff8aea1bf --- /dev/null +++ b/3rdparty/mythes/1.2.5/mythes.hxx @@ -0,0 +1,76 @@ +#ifndef _MYTHES_HXX_ +#define _MYTHES_HXX_ + +// some maximum sizes for buffers +#define MAX_WD_LEN 200 +#define MAX_LN_LEN 16384 + + +// a meaning with definition, count of synonyms and synonym list +struct mentry { + char* defn; + int count; + char** psyns; +}; + + +class MyThes +{ + + int nw; /* number of entries in thesaurus */ + char** list; /* stores word list */ + unsigned int* offst; /* stores offset list */ + char * encoding; /* stores text encoding; */ + + FILE *pdfile; + + // disallow default-constructor, copy-constructor and assignment-operator for now + MyThes(); + MyThes(const MyThes &); + MyThes & operator = (const MyThes &); + +public: + MyThes(const char* idxpath, const char* datpath); + ~MyThes(); + + // lookup text in index and return number of meanings + // each meaning entry has a definition, synonym count and pointer + // when complete return the *original* meaning entry and count via + // CleanUpAfterLookup to properly handle memory deallocation + + int Lookup(const char * pText, int len, mentry** pme); + + void CleanUpAfterLookup(mentry** pme, int nmean); + + char* get_th_encoding(); + +private: + // Open index and dat files and load list array + int thInitialize (const char* indxpath, const char* datpath); + + // internal close and cleanup dat and idx files + void thCleanup (); + + // read a text line (\n terminated) stripping off line terminator + int readLine(FILE * pf, char * buf, int nc); 
+ + // binary search on null terminated character strings + int binsearch(char * wrd, char* list[], int nlst); + + // string duplication routine + char * mystrdup(const char * p); + + // remove cross-platform text line end characters + void mychomp(char * s); + + // return index of char in string + int mystr_indexOfChar(const char * d, int c); + +}; + +#endif + + + + + diff --git a/3rdparty/mythes/Makefile.am b/3rdparty/mythes/Makefile.am new file mode 100644 index 0000000000..dcc2b1957d --- /dev/null +++ b/3rdparty/mythes/Makefile.am @@ -0,0 +1,12 @@ +include $(top_srcdir)/config/common.am + +noinst_LIBRARIES = liblyxmythes.a + +EXTRA_DIST = \ + 1.2.5/AUTHORS \ + 1.2.5/COPYING \ + 1.2.5/README + +liblyxmythes_a_SOURCES = \ + 1.2.5/mythes.cxx \ + 1.2.5/mythes.hxx diff --git a/config/lyxinclude.m4 b/config/lyxinclude.m4 index 8287fb2f5d..9a0bef81a2 100644 --- a/config/lyxinclude.m4 +++ b/config/lyxinclude.m4 @@ -652,14 +652,16 @@ AC_DEFUN([LYX_USE_INCLUDED_MYTHES],[ break]) AC_LANG_POP(C++) fi - if test $use_included_mythes = no ; then - AC_DEFINE(USE_EXTERNAL_MYTHES, 1, [Define as 1 to use an external MyThes library]) - AC_DEFINE_UNQUOTED(MYTHES_H_LOCATION,$mythes_h_location,[Location of mythes.hxx]) - AC_SUBST(MYTHES_LIBS) - else + if test $use_included_mythes = yes ; then + mythes_h_location="<mythes.hxx>" + MYTHES_INCLUDES='-I$(top_srcdir)/3rdparty/mythes/1.2.5/' + MYTHES_LIBS='$(top_builddir)/3rdparty/mythes/liblyxmythes.a' lyx_included_libs="$lyx_included_libs mythes" fi AM_CONDITIONAL(USE_INCLUDED_MYTHES, test x$use_included_mythes = xyes) + AC_DEFINE_UNQUOTED(MYTHES_H_LOCATION,$mythes_h_location,[Location of mythes.hxx]) + AC_SUBST(MYTHES_INCLUDES) + AC_SUBST(MYTHES_LIBS) AC_MSG_CHECKING([whether to use included MyThes library]) AC_MSG_RESULT([$use_included_mythes]) ]) diff --git a/configure.ac b/configure.ac index ce4eb96215..694ddc48fb 100644 --- a/configure.ac +++ b/configure.ac @@ -372,6 +372,7 @@ AC_CONFIG_FILES([Makefile \ 3rdparty/Makefile \ 
3rdparty/boost/Makefile \ 3rdparty/hunspell/Makefile \ + 3rdparty/mythes/Makefile \ 3rdparty/libiconv/Makefile \ $ICONV_ICONV_H_IN \ 3rdparty/zlib/Makefile \ diff --git a/src/Makefile.am b/src/Makefile.am index 3f3e99a0e6..9576076d12 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -4,7 +4,7 @@ include $(top_srcdir)/config/common.am AM_CPPFLAGS += -I$(top_srcdir)/src AM_CPPFLAGS += $(BOOST_INCLUDES) $(ICONV_INCLUDES) $(ZLIB_INCLUDES) -AM_CPPFLAGS += $(ENCHANT_CFLAGS) $(HUNSPELL_CFLAGS) +AM_CPPFLAGS += $(ENCHANT_CFLAGS) $(HUNSPELL_CFLAGS) $(MYTHES_INCLUDES) AM_CPPFLAGS += $(QT_CPPFLAGS) $(QT_CORE_INCLUDES) if BUILD_CLIENT_SUBDIR diff --git a/src/Thesaurus.cpp b/src/Thesaurus.cpp index 4f4ad64d14..cf8567eb53 100644 --- a/src/Thesaurus.cpp +++ b/src/Thesaurus.cpp @@ -27,12 +27,8 @@ #include "support/lstrings.h" #include "support/os.h" -#ifdef USE_EXTERNAL_MYTHES -#include MYTHES_H_LOCATION -#else #include <cstdio> -#include "support/mythes/mythes.hxx" -#endif +#include MYTHES_H_LOCATION #include "frontends/alert.h" diff --git a/src/support/Makefile.am b/src/support/Makefile.am index a63bb89681..f3a8823ac6 100644 --- a/src/support/Makefile.am +++ b/src/support/Makefile.am @@ -113,12 +113,6 @@ liblyxsupport_a_SOURCES = \ unicode.cpp \ unicode.h \ weighted_btree.h -if USE_INCLUDED_MYTHES -liblyxsupport_a_SOURCES += \ - mythes/mythes.cxx \ - mythes/mythes.hxx \ - mythes/license.readme -endif #if INSTALL_MACOSX #liblyxsupport_a_SOURCES += \ diff --git a/src/support/mythes/license.readme b/src/support/mythes/license.readme deleted file mode 100644 index b6bf70a0c7..0000000000 --- a/src/support/mythes/license.readme +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada - * And Contributors. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. All modifications to the source code must be clearly marked as - * such. Binary redistributions based on modified source code - * must be clearly marked as modified versions in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - */ diff --git a/src/support/mythes/mythes.cxx b/src/support/mythes/mythes.cxx deleted file mode 100644 index c530580384..0000000000 --- a/src/support/mythes/mythes.cxx +++ /dev/null @@ -1,365 +0,0 @@ -#include "license.readme" -#include -#include -#include -#include - -#include "mythes.hxx" - -// some basic utility routines - - -// string duplication routine -char * mythesstrdup(const char * p) -{ - - int sl = strlen(p) + 1; - char * d = (char *)malloc(sl); - if (d) { - memcpy(d,p,sl); - return d; - } - return NULL; -} - - -// return index of char in string -int mystr_indexOfChar(const char * d, int c) -{ - const char * p = strchr(d,c); - if (p) return (int)(p-d); - return -1; -} - - -// remove cross-platform text line end characters -void mytheschomp(char * s) -{ - int k = strlen(s); - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; -} - - - -MyThes::MyThes(const char* idxpath, const char * datpath) -{ - nw = 0; - encoding = NULL; - list = NULL; - offst = NULL; - - if (thInitialize(idxpath, datpath) != 1) { - fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath); - fflush(stderr); - if (encoding) free((void*)encoding); - if (list) free((void*)list); - if (offst) free((void*)offst); - // did not initialize properly - throw exception? - } -} - - -MyThes::~MyThes() -{ - if (thCleanup() != 1) { - /* did not cleanup properly - throw exception? 
*/ - } - if (encoding) free((void*)encoding); - encoding = NULL; - list = NULL; - offst = NULL; -} - - -int MyThes::thInitialize(const char* idxpath, const char* datpath) -{ - - // open the index file - FILE * pifile = fopen(idxpath,"r"); - if (!pifile) { - pifile = NULL; - return 0; - } - - // parse in encoding and index size */ - char * wrd; - wrd = (char *)calloc(1, MAX_WD_LEN); - int len = readLine(pifile,wrd,MAX_WD_LEN); - encoding = mythesstrdup(wrd); - len = readLine(pifile,wrd,MAX_WD_LEN); - int idxsz = atoi(wrd); - - - // now allocate list, offst for the given size - list = (char**) calloc(idxsz,sizeof(char*)); - offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int)); - - if ( (!(list)) || (!(offst)) ) { - fprintf(stderr,"Error - bad memory allocation\n"); - fflush(stderr); - return 0; - } - - // now parse the remaining lines of the index - len = readLine(pifile,wrd,MAX_WD_LEN); - while (len > 0) - { - int np = mystr_indexOfChar(wrd,'|'); - if (nw < idxsz) { - if (np >= 0) { - *(wrd+np) = '\0'; - list[nw] = (char *)calloc(1,(np+1)); - memcpy((list[nw]),wrd,np); - offst[nw] = atoi(wrd+np+1); - nw++; - } - } - len = readLine(pifile,wrd,MAX_WD_LEN); - } - - free((void *)wrd); - fclose(pifile); - pifile=NULL; - - /* next open the data file */ - pdfile = fopen(datpath,"r"); - if (!pdfile) { - pdfile = NULL; - return 0; - } - - return 1; -} - - -int MyThes::thCleanup() -{ - /* first close the data file */ - if (pdfile) { - fclose(pdfile); - pdfile=NULL; - } - - /* now free up all the allocated strings on the list */ - for (int i=0; i < nw; i++) - { - if (list[i]) { - free(list[i]); - list[i] = 0; - } - } - - if (list) free((void*)list); - if (offst) free((void*)offst); - - nw = 0; - return 1; -} - - - -// lookup text in index and count of meanings and a list of meaning entries -// with each entry having a synonym count and pointer to an -// array of char * (i.e the synonyms) -// -// note: calling routine should call CleanUpAfterLookup with the original -// 
meaning point and count to properly deallocate memory - -int MyThes::Lookup(const char * pText, int len, mentry** pme) -{ - - *pme = NULL; - - // handle the case of missing file or file related errors - if (! pdfile) return 0; - - long offset = 0; - - /* copy search word and make sure null terminated */ - char * wrd = (char *) calloc(1,(len+1)); - memcpy(wrd,pText,len); - - /* find it in the list */ - int idx = binsearch(wrd,list,nw); - free(wrd); - if (idx < 0) return 0; - - // now seek to the offset - offset = (long) offst[idx]; - int rc = fseek(pdfile,offset,SEEK_SET); - if (rc) { - return 0; - } - - // grab the count of the number of meanings - // and allocate a list of meaning entries - char * buf = NULL; - buf = (char *) malloc( MAX_LN_LEN ); - if (!buf) return 0; - readLine(pdfile, buf, (MAX_LN_LEN-1)); - int np = mystr_indexOfChar(buf,'|'); - if (np < 0) { - free(buf); - return 0; - } - int nmeanings = atoi(buf+np+1); - *pme = (mentry*) malloc( nmeanings * sizeof(mentry) ); - if (!(*pme)) { - free(buf); - return 0; - } - - // now read in each meaning and parse it to get defn, count and synonym lists - mentry* pm = *(pme); - char dfn[MAX_WD_LEN]; - - for (int j = 0; j < nmeanings; j++) { - readLine(pdfile, buf, (MAX_LN_LEN-1)); - - pm->count = 0; - pm->psyns = NULL; - pm->defn = NULL; - - // store away the part of speech for later use - char * p = buf; - char * pos = NULL; - np = mystr_indexOfChar(p,'|'); - if (np >= 0) { - *(buf+np) = '\0'; - pos = mythesstrdup(p); - p = p + np + 1; - } else { - pos = mythesstrdup(""); - } - - // count the number of fields in the remaining line - int nf = 1; - char * d = p; - np = mystr_indexOfChar(d,'|'); - while ( np >= 0 ) { - nf++; - d = d + np + 1; - np = mystr_indexOfChar(d,'|'); - } - pm->count = nf; - pm->psyns = (char **) malloc(nf*sizeof(char*)); - - // fill in the synonym list - d = p; - for (int j = 0; j < nf; j++) { - np = mystr_indexOfChar(d,'|'); - if (np > 0) { - *(d+np) = '\0'; - pm->psyns[j] = 
mythesstrdup(d); - d = d + np + 1; - } else { - pm->psyns[j] = mythesstrdup(d); - } - } - - // add pos to first synonym to create the definition - int k = strlen(pos); - int m = strlen(pm->psyns[0]); - if ((k+m) < (MAX_WD_LEN - 1)) { - strncpy(dfn,pos,k); - *(dfn+k) = ' '; - strncpy((dfn+k+1),(pm->psyns[0]),m+1); - pm->defn = mythesstrdup(dfn); - } else { - pm->defn = mythesstrdup(pm->psyns[0]); - } - free(pos); - pm++; - - } - free(buf); - - return nmeanings; -} - - - -void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings) -{ - - if (nmeanings == 0) return; - if ((*pme) == NULL) return; - - mentry * pm = *pme; - - for (int i = 0; i < nmeanings; i++) { - int count = pm->count; - for (int j = 0; j < count; j++) { - if (pm->psyns[j]) free(pm->psyns[j]); - pm->psyns[j] = NULL; - } - if (pm->psyns) free(pm->psyns); - pm->psyns = NULL; - if (pm->defn) free(pm->defn); - pm->defn = NULL; - pm->count = 0; - pm++; - } - pm = *pme; - free(pm); - *pme = NULL; - return; -} - - -// read a line of text from a text file stripping -// off the line terminator and replacing it with -// a null string terminator. 
-// returns: -1 on error or the number of characters in -// in the returning string - -// A maximum of nc characters will be returned - -int MyThes::readLine(FILE * pf, char * buf, int nc) -{ - - if (fgets(buf,nc,pf)) { - mytheschomp(buf); - return strlen(buf); - } - return -1; -} - - - -// performs a binary search on null terminated character -// strings -// -// returns: -1 on not found -// index of wrd in the list[] - -int MyThes::binsearch(char * sw, char* list[], int nlst) -{ - int lp, up, mp, j, indx; - lp = 0; - up = nlst-1; - indx = -1; - if (strcmp(sw,list[lp]) < 0) return -1; - if (strcmp(sw,list[up]) > 0) return -1; - while (indx < 0 ) { - mp = (int)((lp+up) >> 1); - j = strcmp(sw,list[mp]); - if ( j > 0) { - lp = mp + 1; - } else if (j < 0 ) { - up = mp - 1; - } else { - indx = mp; - } - if (lp > up) return -1; - } - return indx; -} - -char * MyThes::get_th_encoding() -{ - if (encoding) return encoding; - return NULL; -} - diff --git a/src/support/mythes/mythes.hxx b/src/support/mythes/mythes.hxx deleted file mode 100644 index 489481b3d7..0000000000 --- a/src/support/mythes/mythes.hxx +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef _MYTHES_HXX_ -#define _MYTHES_HXX_ - -// some maximum sizes for buffers -#define MAX_WD_LEN 200 -#define MAX_LN_LEN 16384 - - -// a meaning with definition, count of synonyms and synonym list -struct mentry { - char* defn; - int count; - char** psyns; -}; - - -class MyThes -{ - - int nw; /* number of entries in thesaurus */ - char** list; /* stores word list */ - unsigned int* offst; /* stores offset list */ - char * encoding; /* stores text encoding; */ - - FILE *pdfile; - - // disallow copy-constructor and assignment-operator for now - MyThes(); - MyThes(const MyThes &); - MyThes & operator = (const MyThes &); - -public: - MyThes(const char* idxpath, const char* datpath); - ~MyThes(); - - // lookup text in index and return number of meanings - // each meaning entry has a defintion, synonym count and pointer - // when complete return 
the *original* meaning entry and count via - // CleanUpAfterLookup to properly handle memory deallocation - - int Lookup(const char * pText, int len, mentry** pme); - - void CleanUpAfterLookup(mentry** pme, int nmean); - - char* get_th_encoding(); - -private: - // Open index and dat files and load list array - int thInitialize (const char* indxpath, const char* datpath); - - // internal close and cleanup dat and idx files - int thCleanup (); - - // read a text line (\n terminated) stripping off line terminator - int readLine(FILE * pf, char * buf, int nc); - - // binary search on null terminated character strings - int binsearch(char * wrd, char* list[], int nlst); - -}; - -#endif - - - - -