/[smeserver]/rpms/aspell-nl/sme7/ispellaff2myspell
ViewVC logotype

Annotation of /rpms/aspell-nl/sme7/ispellaff2myspell

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Sun Mar 2 02:43:09 2008 UTC (16 years, 9 months ago) by slords
Branch: MAIN
CVS Tags: aspell-nl-0_1e-1_el4_sme, HEAD
Import on branch sme7 of package aspell-nl-0.1e-1.el4.sme.src.rpm

1 slords 1.1 #!/usr/bin/perl -w
2     # -*- coding: iso-8859-1 -*-
3     # $Id: ispellaff2myspell,v 1.1 2006/08/17 14:41:12 varekova Exp $
4     #
5     # (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
6     #
7     # This program is free software; you can redistribute it and/or modify
8     # it under the terms of the GNU General Public License as published by
9     # the Free Software Foundation; either version 2 of the License, or
10     # (at your option) any later version.
11     #
12     # This program is distributed in the hope that it will be useful,
13     # but WITHOUT ANY WARRANTY; without even the implied warranty of
14     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15     # GNU General Public License for more details.
16     #
17     # You should have received a copy of the GNU General Public License
18     # along with this program; if not, write to the Free Software
19     # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20    
21    
# usage()
#
# Print the command-line help on standard output and terminate the program.
# It is called when option parsing fails or when no affix file was given.
# Takes no arguments and never returns.
sub usage {
    # Non-interpolating here-document: the backticks and the '@' in the
    # e-mail address are printed literally.
    print <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es> License: GPL

Usage:
    ispellaff2myspell [options] <affixfile>

Options:
    --affixfile=s      Affix file
    --bylocale         Use current locale setup for upper/lowercase
                       conversion
    --charset=s        Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
    --debug            Print debugging info
    --extraflags[=s]   Allow some non alphabetic flags
    --lowercase=s      Lowercase string
    --myheader=s       Header file
    --printcomments    Print commented lines in output
    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
    --uppercase=s      Uppercase string
    --wordlist=s       Still unused

Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like

    ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the ispell munched wordlist

END_USAGE
    exit;
}
56    
# debugprint(LIST)
#
# Write the arguments, joined by single spaces, to STDERR -- but only when
# the global $debug flag is true.  Nothing is printed otherwise.
sub debugprint {
    print STDERR "@_" if $debug;
}
62    
# shipoutflag()
#
# Print the affix entries collected for the current flag and reset the
# per-flag state.
#
# Reads the globals @flag_array (the formatted entry lines), $myaffix,
# $flagname, $flagcombine and $split.  For a non-empty @flag_array it prints
# a header line "<affix> <flag> <combine> <count>", the entries one per line,
# and a blank line.  When $split is a positive number the entries are printed
# in chunks of at most $split lines, each with its own header; every header
# except the last one carries a trailing " S".
#
# Afterwards @flag_array, $flagname and $flagcombine are always cleared.
sub shipoutflag {
    my $flag_entries = scalar @flag_array;

    if ( $flag_entries != 0 ) {
        # Only chunk for a positive split size.  A negative value would make
        # splice() remove nothing once a single entry is left, so the loop
        # below would never terminate.
        if ( $split and $split > 0 ) {
            while (@flag_array) {
                my @flag_subarray   = splice( @flag_array, 0, $split );
                my $subflag_entries = scalar @flag_subarray;

                # More entries pending: mark this header with the " S" suffix.
                my $suffix = @flag_array ? ' S' : '';
                print "$myaffix $flagname $flagcombine $subflag_entries$suffix\n";
                print join( "\n", @flag_subarray );
                print "\n\n";
            }
        }
        else {
            print "$myaffix $flagname $flagcombine $flag_entries\n";
            print join( "\n", @flag_array );
            print "\n\n";
        }
    }

    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
}
89    
# mylc(STRING)
#
# Return a lowercased copy of STRING.
#
# * If the global $bylocale is true, lc() is used under "use locale".
# * Otherwise a tr/// translation is applied.  The upper/lowercase
#   character lists come from the built-in table selected by the global
#   $charset (latin0, latin1, latin2 or latin3) or, for any other $charset
#   value, from the globals $uppercase and $lowercase.
#
# Dies when $charset is not a built-in one and $lowercase/$uppercase are not
# both set, or when the character lists do not form a valid tr/// operation.
sub mylc {
    my $inputstring = shift;
    my $outputstring;

    # Built-in tables as [ lowercase list, uppercase list ].  The 8-bit
    # characters are written as byte escapes (tr/// understands \xNN) so the
    # tables do not depend on the encoding this source file is stored in.
    my %case_table = (
        latin0 => [
            'a-z\xE0-\xF6\xF8-\xFE\xBD\xA8\xB8',
            'A-Z\xC0-\xD6\xD8-\xDE\xBC\xA6\xB4',
        ],
        latin1 => [
            'a-z\xE0-\xF6\xF8-\xFE',
            'A-Z\xC0-\xD6\xD8-\xDE',
        ],
        latin2 => [
            'a-z\xB1\xB3\xB5\xB6\xB9\xBA\xBB\xBC\xBE\xBF\xE0-\xF6\xF8-\xFE',
            'A-Z\xA1\xA3\xA5\xA6\xA9\xAA\xAB\xAC\xAE\xAF\xC0-\xD6\xD8-\xDE',
        ],
        latin3 => [
            'a-z\xB1\xB6\xB9\xBA\xBB\xBC\xBF\xE0-\xE2\xE4-\xEF\xF1-\xFE',
            'A-Z\xA1\xA6\xA9\xAA\xAB\xAC\xAF\xC0-\xC2\xC4-\xCF\xD1-\xDE',
        ],
    );

    if ($bylocale) {
        {
            use locale;
            $outputstring = lc $inputstring;
        }
    }
    else {
        my ( $lower_list, $upper_list );

        if ( exists $case_table{$charset} ) {
            ( $lower_list, $upper_list ) = @{ $case_table{$charset} };
        }
        else {
            # Both lists are required; with only one of them tr/// would
            # silently leave the string unchanged.
            if ( not $lowercase or not $uppercase ) {
                die "Unsupported charset [$charset]

use explicitely --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
            }
            ( $lower_list, $upper_list ) = ( $lowercase, $uppercase );
        }

        $outputstring = $inputstring;

        # tr/// does not interpolate variables, hence the string eval.
        # NOTE(review): the lists may come straight from the command line and
        # are evaluated as Perl code; a "/" in them breaks the expression.
        eval "\$outputstring =~ tr/$upper_list/$lower_list/; 1"
            or die "Error: invalid uppercase/lowercase character lists: $@";
    }
    return $outputstring;
}
129    
# validate_flag(FLAG)
#
# Decide whether FLAG may be used as a flag name.
#
# Returns FLAG unchanged when it contains at least one ASCII letter.
# Otherwise, if the global $hasextraflags is true, each key of the global
# %theextraflags is tried as a regular expression anchored at the start of
# FLAG; on the first match that prefix is removed and the rest is returned.
# Returns the empty string when the flag is not acceptable.
#
# The sub used to carry an empty prototype "()" although it takes one
# argument, which forced callers to use the "&" call form.  Without the
# prototype both "validate_flag(...)" and "&validate_flag(...)" work.
sub validate_flag {
    my $flag = shift;

    # NOTE(review): the pattern is not anchored, so any flag that merely
    # contains a letter is accepted as is -- confirm that this is intended.
    if ( $flag =~ m/[a-zA-Z]+/ ) {
        return $flag;
    }
    elsif ($hasextraflags) {
        # Longest pattern first (then alphabetical) so the result does not
        # depend on the random order of hash keys when several match.
        my @prefixes = sort { length($b) <=> length($a) or $a cmp $b }
            keys %theextraflags;

        foreach my $prefix (@prefixes) {
            # $prefix is deliberately used as a regular expression.
            if ( $flag =~ m/^$prefix/ ) {
                $flag =~ s/^$prefix//;
                return $flag;
            }
        }
    }
    return '';
}
144    
# process_replacements(FILE)
#
# Copy the replacement entries of FILE to standard output, preceded by a
# "REP <count>" line.
#
# A line is kept when it starts with "REP" followed by optional whitespace
# and a non-digit character, unless it is of the form "REP <number>" (an
# existing count line).  Trailing CR/LF line ends are removed from kept
# lines.
#
# Dies when FILE cannot be opened.
sub process_replacements {
    my $file     = shift;
    my @replaces = ();

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and $! tells why the open failed.
    open( my $replace_fh, '<', $file )
        or die "Error: Could not open replacements file: $file ($!)\n";

    # A named loop variable keeps the global $_ untouched.
    while ( my $line = <$replace_fh> ) {
        next unless $line =~ m/^REP[\s\t]*\D.*/;
        next if $line =~ m/^REP\s+[0-9]+/;    # old count line, recomputed below
        $line =~ s/\015\012//;
        $line =~ s/\015//;
        chomp $line;
        push @replaces, $line;
    }
    close $replace_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    foreach my $entry (@replaces) {
        print $entry . "\n";
    }
}
166    
167     # -----------------------------------------------------------
168     # Now the program starts, after the functions are defined
169     # -----------------------------------------------------------
170    
171     use Getopt::Long;
172    
# Command-line settings.  Each one starts out empty (false) and is filled in
# by GetOptions below.
$affixfile = $bylocale = $charset = $debug = $lowercase = $myheader = '';
$printcomments = $replacements = $split = $uppercase = $wordlist = '';
$hasextraflags = '';
@flag_array    = ();
%theextraflags = ();

# Pieces of the affix rule currently being converted.
( $rootremove, $rootname, $addtoroot, $comment ) = ( "0", '', '', '' );

# State of the flag currently being collected.
( $flagname, $flagcombine, $inflags ) = ( '', '', '' );

# Handler for --extraflags.  The option may be repeated and its value is
# optional: it always switches extra flags on, and a non-empty value is
# additionally remembered as a key of %theextraflags.
my $note_extraflag = sub {
    $hasextraflags = 1;
    $theflag       = $_[1];    # arguments are (option name, option value)
    $theextraflags{$theflag}++ if $theflag;
};

my @option_spec = (
    'affixfile=s'    => \$affixfile,
    'bylocale'       => \$bylocale,
    'charset=s'      => \$charset,
    'debug'          => \$debug,
    'extraflags:s'   => $note_extraflag,
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
);

# Unknown or malformed options: show the help text and stop.
GetOptions(@option_spec) or usage();
214    
# The affix file may be given with --affixfile or as the first remaining
# command-line argument; without either, show the help text and stop.
if ( not $affixfile ) {
    $affixfile = shift @ARGV;
    usage() unless $affixfile;
}

# --charset and explicit --lowercase/--uppercase lists exclude each other;
# with none of them given the charset defaults to latin1.
if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
";
}
elsif ( not $lowercase and not $uppercase and not $charset ) {
    $charset = "latin1";
}

# --extraflags without any value: allow flags starting with a backslash.
# The key is the two-character regular expression \\ (one literal backslash).
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}

debugprint("$affixfile $charset");

# Three-argument open: the file name is never interpreted as an open mode.
# The bareword handle AFFIXFILE is kept because the conversion loop further
# down reads from it.
open( AFFIXFILE, '<', $affixfile )
    or die "Error: Could not open affix file: $affixfile ($!)\n";

# Copy the user-supplied myspell header verbatim in front of the affix
# table.  The file is read directly instead of through `cat`, so the name is
# never passed to a shell and an unreadable header is reported instead of
# silently yielding an empty one.
if ($myheader) {
    open( my $header_fh, '<', $myheader )
        or die "Error: Could not open header file: $myheader ($!)\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}
241    
# Main conversion loop: read the ispell affix file line by line.
#
#  * comment lines are skipped (echoed when --printcomments is set),
#  * blank lines are skipped,
#  * "prefixes" / "suffixes" select the PFX / SFX section,
#  * "flag ..." lines start a new flag; the entries collected for the
#    previous one are printed first,
#  * any other line inside a flag is converted into one myspell entry and
#    appended to @flag_array.
while (<AFFIXFILE>) {
    chomp;
    if (/^\s*\#.*/) {
        debugprint("Ignoring line $.\n");
        print "$_\n" if $printcomments;
    }
    elsif (/^\s*$/) {
        debugprint("Ignoring line $.\n");
    }
    elsif (/^\s*prefixes/) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    }
    elsif (/^\s*suffixes/) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    }
    elsif (/^[\s\t]*flag.*/) {
        next if not $affix;    # In case we are still in the preamble
        shipoutflag() if $inflags;
        $inflags = "yes";

        # Reduce the line to the bare flag name: drop the leading "flag"
        # keyword and everything from the colon on.
        s/^[\s\t]*flag[\s\t]*//;
        s/[\s\t]*:.*$//;
        debugprint("Found flag $_ in line $.\n");

        # A "*" in the flag definition sets the combine field to "Y".
        if (/\*/) {
            s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint("Flag renamed to $_ with combine=$flagcombine\n");
        }
        else {
            $flagcombine = "N";
        }

        # The "&" call form is kept on purpose: it works whether or not
        # validate_flag is declared with a prototype.
        if ( $flagname = &validate_flag($_) ) {
            $myaffix = $affix;
        }
        else {
            # Invalid flag: its entries are still written, but commented out.
            $myaffix  = "\# $affix";
            $flagname = $_;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    }
    elsif ( $affix and $inflags ) {
        # Split off a trailing "# ..." comment; the rest is the rule.
        ( $rootname, @comments ) = split( '#', $_ );
        $comment = '# ' . join( '#', @comments );

        $rootname =~ s/\s*//g;
        $rootname = mylc($rootname);

        # "condition > replacement"
        ( $rootname, $addtoroot ) = split( '>', $rootname );

        # Malformed lines (nothing on one side of ">") leave these undefined.
        # Use empty strings so the output stays the same but perl -w does
        # not emit "uninitialized value" warnings.
        $rootname  = '' unless defined $rootname;
        $addtoroot = '' unless defined $addtoroot;

        # "-strip,add": a leading "-" introduces the part to remove.
        if ( $addtoroot =~ s/^\-//g ) {
            ( $rootremove, $addtoroot ) = split( ',', $addtoroot );
            $rootremove = '' unless defined $rootremove;
            $addtoroot  = "0" unless $addtoroot;
            $addtoroot  = "0" if ( $addtoroot eq "-" );
        }
        else {
            $rootremove = "0";
        }
        $addtoroot =~ s/\\\-/\-/g;    # prefix ANTI\- to anti-

        # An unconditional rule (".") that strips something uses the stripped
        # string as its condition.
        if ( $rootname eq '.' && $rootremove ne "0" ) {
            $rootname = $rootremove;
        }

        debugprint("$rootname, $addtoroot, $rootremove\n");
        if ($printcomments) {
            $affix_line = sprintf(
                "%s %s %-5s %-11s %-24s %s",
                $myaffix,   $flagname, $rootremove,
                $addtoroot, $rootname, $comment
            );
        }
        else {
            $affix_line = sprintf(
                "%s %s %-5s %-11s %s",
                $myaffix,   $flagname, $rootremove,
                $addtoroot, $rootname
            );
        }

        # Reset the per-rule state before the next line.
        $rootremove = "0";
        $rootname   = '';
        $addtoroot  = '';
        $comment    = '';
        @comments   = ();
        push @flag_array, $affix_line;
        debugprint("$affix_line\n");
    }
    else {
        # Rule-like line outside of any flag: nothing to do.
    }
}

# Print the entries of the last flag.
shipoutflag();

close AFFIXFILE;

# Optionally append the replacement table.
if ($replacements) {
    process_replacements($replacements);
}
327    
328     __END__
329    
330     =head1 NAME
331    
332     B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.
333    
334     =head1 SYNOPSIS
335    
336     ispellaff2myspell [options] <affixfile> --myheader your_header
337    
338     Options:
339    
340     --affixfile=s Affix file
341     --bylocale Use current locale setup for upper/lowercase
342     conversion
343     --charset=s Use specified charset for upper/lowercase
344     conversion (defaults to latin1)
345     --debug Print debugging info
346     --extraflags[=s] Allow some non alphabetic flags
347     --lowercase=s Lowercase string
348     --myheader=s Header file
349     --printcomments Print commented lines in output
350     --replacements=s Replacements file
351     --split=i Split flags with more than i entries
352     --uppercase=s Uppercase string
353    
354     =head1 DESCRIPTION
355    
356     B<ispellaff2myspell> is a script that will convert ispell affix tables
357     to myspell format in a more or less successful way.
358    
359     This script does not create the dict file. Something like
360    
361     ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
362    
363     should do the work, with mydict.words+ being the munched wordlist
364    
365     =head1 OPTIONS
366    
367     =over 8
368    
369     =item B<--affixfile=s>
370    
371     Affix file. You can put it directly in the command line.
372    
373     =item B<--bylocale>
374    
375     Use current locale setup for upper/lowercase conversion. Make sure
376     that the selected locale matches the dictionary one, or you might get
377     into trouble.
378    
379     =item B<--charset=s>
380    
381     Use specified charset for upper/lowercase conversion (defaults to latin1).
382     Currently allowed values for charset are: latin0, latin1, latin2, latin3.
383    
384     =item B<--debug>
385    
386     Print some debugging info.
387    
388     =item B<--extraflags:s>
389    
390     Allows some non alphabetic flags.
391    
392     When invoked with no value the supported flags are currently those
393     corresponding to chars represented with the escape char B<\> as
394     first char. B<\> will be stripped.
395    
396     When given with the flag prefix will allow that flag and strip the
397     given prefix. Be careful when giving the prefix to properly escape chars,
398     e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
399     B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
400     flags and pass them unmodified.
401    
402     You will need a call to -e for each flag type, e.g.,
403     B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).
404    
405     When a prefix is explicitly set, the default value (anything starting with B<\>)
406     is disabled and you need to enable it explicitly, as in the previous example.
407    
408     =item B<--lowercase=s>
409    
410     Lowercase string. Manually set the string of lowercase chars. This
411     requires B<--uppercase> having exactly that string but uppercase.
412    
413     =item B<--myheader=s>
414    
415     Header file. The myspell aff header. You need to write it
416     manually. This can contain everything you want to be before the affix table
417    
418     =item B<--printcomments>
419    
420     Print commented lines in output.
421    
422     =item B<--replacements=file>
423    
424     Add a pre-defined replacements table taken from 'file' to the .aff file.
425     Will skip lines not beginning with REP, and set the replacements number
426     appropriately.
427    
428     =item B<--split=i>
429    
430     Split flags with more than i entries. This can be of interest for flags
431     having a lot of entries. Will split the flag in chunks containing B<i>
432     entries.
433    
434     =item B<--uppercase=s>
435    
436     Uppercase string. Manually set the string of uppercase chars. This
437     requires B<--lowercase> having exactly that string but lowercase.
438    
439     =back
440    
441     If your encoding is currently unsupported you can send me a file with
442     the two strings of lower and uppercase chars. Note that they must match
443     exactly but case changed. It will look something like
444    
445     $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
446     $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
447    
448     =head1 SEE ALSO
449    
450     The OpenOffice.org Lingucomponent Project home page
451    
452     L<http://lingucomponent.openoffice.org/index.html>
453    
454     and the document
455    
456     L<http://lingucomponent.openoffice.org/affix.readme>
457    
458     that provides information about the basics of the myspell affix file format.
459    
460     You can also take a look at
461    
462     /usr/share/doc/libmyspell-dev/affix.readme.gz
463     /usr/share/doc/libmyspell-dev/README.compoundwords
464     /usr/share/doc/libmyspell-dev/README.replacetable
465    
466     in your Debian system.
467    
468     =head1 AUTHORS
469    
470     Agustin Martin <agustin.martin@hispalinux.es>
471    
472     =cut

admin@koozali.org
ViewVC Help
Powered by ViewVC 1.2.1 RSS 2.0 feed