/[smeserver]/rpms/aspell-nl/sme7/ispellaff2myspell
ViewVC logotype

Contents of /rpms/aspell-nl/sme7/ispellaff2myspell

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (show annotations) (download)
Sun Mar 2 02:43:09 2008 UTC (16 years, 10 months ago) by slords
Branch: MAIN
CVS Tags: aspell-nl-0_1e-1_el4_sme, HEAD
Import on branch sme7 of package aspell-nl-0.1e-1.el4.sme.src.rpm

1 #!/usr/bin/perl -w
2 # -*- coding: iso-8859-1 -*-
3 # $Id: ispellaff2myspell,v 1.1 2006/08/17 14:41:12 varekova Exp $
4 #
5 # (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20
21
# usage()
#
# Print the command-line help text and terminate the program.
#
# Within this script the sub is only invoked on error paths (GetOptions
# rejecting the command line, or no affix file being given).  The text is
# therefore written to STDERR, so it cannot end up inside the generated
# affix data on STDOUT, and the process exits with a non-zero status.
#
# Takes no arguments and never returns.
sub usage {
    # Non-interpolating here-document: '@' and backticks are literal text.
    print STDERR <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>     License: GPL

Usage:
    ispellaff2myspell [options] <affixfile>

Options:
    --affixfile=s      Affix file
    --bylocale         Use current locale setup for upper/lowercase
                       conversion
    --charset=s        Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
    --debug            Print debugging info
    --extraflags[=s]   Allow some non alphabetic flags
    --lowercase=s      Lowercase string
    --myheader=s       Header file
    --printcomments    Print commented lines in output
    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
    --uppercase=s      Uppercase string
    --wordlist=s       Still unused

Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like

    ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the ispell munched wordlist

END_USAGE
    exit 1;
}
56
# debugprint(@message)
#
# Write the arguments to STDERR, interpolated as "@_" (i.e. joined with the
# list separator), but only when the global $debug flag is true.
sub debugprint {
    print STDERR "@_" if $debug;
}
62
# shipoutflag()
#
# Print the rules collected in the global @flag_array as one myspell affix
# block (or several, when splitting is requested) and reset the per-flag
# state.
#
# Reads the globals $myaffix, $flagname, $flagcombine and $split.
# Each block is a header line "<affix> <flag> <combine> <count>" followed by
# the rules, one per line, and an empty line.  When $split is a positive
# number the rules are emitted in chunks of at most $split entries; every
# chunk header except the last one carries a trailing " S".
#
# Nothing is printed when @flag_array is empty.  In all cases @flag_array,
# $flagname and $flagcombine are cleared afterwards.
sub shipoutflag {
    my $flag_entries = scalar @flag_array;

    if ( $flag_entries != 0 ) {
        # Only split for a positive chunk size.  splice() with a negative
        # length stops removing elements once the array is short enough, so
        # the loop below would never terminate for e.g. --split=-1.
        if ( $split and $split > 0 ) {
            while (@flag_array) {
                my @flag_subarray   = splice( @flag_array, 0, $split );
                my $subflag_entries = scalar @flag_subarray;
                my $header = "$myaffix $flagname $flagcombine $subflag_entries";

                # Rules still pending: mark this chunk as continued.
                $header .= " S" if scalar @flag_array;

                print "$header\n";
                print join( "\n", @flag_subarray );
                print "\n\n";
            }
        } else {
            print "$myaffix $flagname $flagcombine $flag_entries\n";
            print join( "\n", @flag_array );
            print "\n\n";
        }
    }
    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
}
89
# mylc($string)
#
# Return a lowercased copy of $string.
#
# With the global $bylocale set, Perl's lc() is used under "use locale".
# Otherwise the conversion is a tr/// from the global $uppercase table to the
# global $lowercase table.  For the known values of the global $charset
# (latin0, latin1, latin2, latin3) both tables are (re)assigned here on every
# call; for any other $charset the tables supplied by the caller are used
# as they are.
#
# Dies when $charset is unknown and neither table is set, and when the
# tables cannot be compiled into a valid tr/// operator.
#
# NOTE(review): the table literals are meant to be single-byte characters of
# the respective ISO-8859-x charset (the file declares "coding: iso-8859-1");
# keep the file in that encoding when editing them.
sub mylc {
    my $inputstring = shift;
    my $outputstring;

    if ($bylocale) {
        {
            use locale;
            $outputstring = lc $inputstring;
        }
    } else {
        if ( $charset eq "latin0" ) {
            $lowercase = 'a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ½¨¸';
            $uppercase = 'A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ¼¦´';
        } elsif ( $charset eq "latin1" ) {
            $lowercase = 'a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
            $uppercase = 'A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
        } elsif ( $charset eq "latin2" ) {
            $lowercase = 'a-z±³µ¶¹º»¼¾¿àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
            $uppercase = 'A-Z¡£¥¦©ª«¬®¯ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
        } elsif ( $charset eq "latin3" ) {
            $lowercase = 'a-z±¶¹º»¼¿àáâäåæçèéêëìíîïñòóôõö÷øùúûüýþ';
            $uppercase = 'A-Z¡¦©ª«¬¯ÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖ×ØÙÚÛÜÝÞ';
        } else {
            # Unknown charset: only acceptable when the caller supplied the
            # conversion tables explicitly.
            if ( not $lowercase and not $uppercase ) {
                die "Unsupported charset [$charset]

use explicitely --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
            }
        }
        $outputstring = $inputstring;

        # tr/// does not interpolate variables, hence the string eval.
        # NOTE(review): the tables may come straight from the command line
        # and are compiled as Perl code here; only run with trusted options.
        eval "\$outputstring=~tr/$uppercase/$lowercase/";

        # A table that does not compile would otherwise leave the string
        # silently unconverted.
        die "Error: invalid case conversion tables [$uppercase] -> [$lowercase]: $@"
            if $@;
    }
    return $outputstring;
}
129
# validate_flag($flag)
#
# Decide whether $flag may be used as a myspell flag name.
#
# Returns $flag unchanged when it contains at least one ASCII letter.
# Otherwise, if the global $hasextraflags is set, each key of the global
# %theextraflags is tried as a regular expression anchored at the start of
# $flag; on the first match the matched prefix is removed and the remainder
# is returned.  Returns the empty string when the flag is not acceptable.
#
# The sub is declared without a prototype: it takes one argument, and the
# former empty prototype "()" only worked because the caller bypassed it
# with the "&" call syntax.
sub validate_flag {
    my $flag = shift;

    return $flag if $flag =~ m/[a-zA-Z]+/;

    if ($hasextraflags) {
        foreach my $prefix ( keys %theextraflags ) {
            # The keys are used as patterns on purpose (not quoted), so the
            # user controls what is stripped.
            return $flag if $flag =~ s/^$prefix//;
        }
    }
    return '';
}
144
# process_replacements($file)
#
# Copy the replacement table found in $file to STDOUT.
#
# Only lines starting with "REP" are considered; a line of the form
# "REP <number>" (an existing entry count) is dropped.  Line terminators
# (CRLF, CR or LF) are removed from the kept lines.  The output is a fresh
# "REP <count>" line followed by the kept lines, one per line.
#
# Dies when $file cannot be opened.
sub process_replacements {
    my $file     = shift;
    my @replaces = ();

    # Three-argument open: the file name is taken literally.
    open( my $replace_fh, '<', $file )
        or die "Error: Could not open replacements file: $file: $!\n";

    # A lexical loop variable keeps the caller's $_ untouched.
    while ( my $line = <$replace_fh> ) {
        next unless $line =~ m/^REP[\s\t]*\D.*/;
        next if $line =~ m/^REP\s+[0-9]+/;
        $line =~ s/\015\012//;
        $line =~ s/\015//;
        chomp $line;
        push @replaces, $line;
    }
    close $replace_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    foreach my $entry (@replaces) {
        print $entry . "\n";
    }
}
166
167 # -----------------------------------------------------------
168 # Now the program starts, after the functions are defined
169 # -----------------------------------------------------------
170
171 use Getopt::Long;
172
# Option values, filled in by GetOptions below.  They are package globals
# because the subs defined earlier in the file read them directly.
$_ = '' for (
    $affixfile, $bylocale,      $charset,      $debug,
    $lowercase, $myheader,      $printcomments, $replacements,
    $split,     $uppercase,     $wordlist,     $hasextraflags,
);
@flag_array    = ();
%theextraflags = ();

# State of the rule currently being converted.
$rootremove = "0";
$_ = '' for ( $rootname, $addtoroot, $comment );

# State of the flag currently being collected.
$_ = '' for ( $flagname, $flagcombine, $inflags );

# Parse the command line; any error shows the usage text.
GetOptions(
    'affixfile=s' => \$affixfile,
    'bylocale'    => \$bylocale,
    'charset=s'   => \$charset,
    'debug'       => \$debug,

    # --extraflags takes an optional value and may be repeated: every
    # non-empty value is recorded as a key of %theextraflags.
    'extraflags:s' => sub {
        $hasextraflags = 1;
        $theflag       = $_[1];
        $theextraflags{$theflag}++ if $theflag;
    },
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
) or usage();
214
# The affix file may also be given as the first positional argument.
if ( not $affixfile ) {
    $affixfile = shift or usage();
}

# A named charset and explicit conversion tables exclude each other; with
# neither given, latin1 is assumed.
if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
";
} elsif ( not $lowercase and not $uppercase and not $charset ) {
    $charset = "latin1";
}

# --extraflags without any value: accept flags written with a leading
# backslash (the key is used later as a pattern matching one backslash).
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}

# Terminate the line so that the next debug message starts on its own line.
debugprint("$affixfile $charset\n");
233
# Open the ispell affix file.  The bareword handle AFFIXFILE is kept because
# the conversion loop further down reads from it; the three-argument form
# takes the file name literally.
open( AFFIXFILE, '<', $affixfile )
    or die "Error: Could not open affix file: $affixfile: $!\n";

# Copy the user-supplied myspell header, followed by an empty line, to the
# output.  The file is read directly instead of through a shell "cat", so
# names containing blanks or shell metacharacters are safe and an unreadable
# header is reported instead of being silently skipped.
if ($myheader) {
    open( my $header_fh, '<', $myheader )
        or die "Error: Could not open header file: $myheader: $!\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}
241
# Convert the affix table line by line.
#
# Comment and empty lines are skipped (comments are echoed with
# --printcomments).  "prefixes" / "suffixes" select the affix type, "flag"
# lines start a new flag, and every other line inside a flag is taken as a
# rule "<condition> > [-<strip>,]<append>" and queued in @flag_array until
# the flag is shipped out.
while (<AFFIXFILE>) {
    chomp;
    if (/^\s*\#.*/) {
        debugprint("Ignoring line $.\n");
        print "$_\n" if $printcomments;
    } elsif (/^\s*$/) {
        debugprint("Ignoring line $.\n");
    } elsif (/^\s*prefixes/) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    } elsif (/^\s*suffixes/) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    } elsif (/^[\s\t]*flag.*/) {
        next if not $affix;    # In case we are still in the preamble
        shipoutflag() if $inflags;    # flush the previous flag first
        $inflags = "yes";

        # Reduce the line to the bare flag name.
        s/^[\s\t]*flag[\s\t]*//;
        s/[\s\t]*:.*$//;
        debugprint("Found flag $_ in line $.\n");

        # A "*" marks a flag that may be combined with other affixes.
        if (/\*/) {
            s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint("Flag renamed to $_ with combine=$flagcombine\n");
        } else {
            $flagcombine = "N";
        }

        # Called with "&" so the call does not depend on how the sub is
        # declared.
        if ( $flagname = &validate_flag($_) ) {
            $myaffix = $affix;
        } else {
            # Rejected flags are still written, but commented out.
            $myaffix  = "\# $affix";
            $flagname = $_;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    } elsif ( $affix and $inflags ) {
        my @comments;
        ( $rootname, @comments ) = split( '#', $_ );
        $comment = '# ' . join( '#', @comments );

        $rootname =~ s/\s*//g;
        $rootname = mylc($rootname);
        ( $rootname, $addtoroot ) = split( '>', $rootname );

        # Without a "> <append>" part there is nothing to convert; going on
        # would only produce uninitialized-value warnings and a rule with a
        # missing field.
        if ( not defined $addtoroot ) {
            print STDERR "Ignoring malformed rule in line $.\n";
            $rootname  = '';
            $addtoroot = '';
            $comment   = '';
            next;
        }

        # "-<strip>,<append>": characters to remove before appending.
        if ( $addtoroot =~ s/^\-//g ) {
            ( $rootremove, $addtoroot ) = split( ',', $addtoroot );
            $addtoroot = "0" unless $addtoroot;
            $addtoroot = "0" if ( $addtoroot eq "-" );
        } else {
            $rootremove = "0";
        }
        $addtoroot =~ s/\\\-/\-/g;    # prefix ANTI\- to anti-

        # An unconditional rule that strips something uses the stripped
        # string as its condition.
        if ( $rootname eq '.' && $rootremove ne "0" ) {
            $rootname = $rootremove;
        }

        debugprint("$rootname, $addtoroot, $rootremove\n");
        my $affix_line;
        if ($printcomments) {
            $affix_line = sprintf( "%s %s %-5s %-11s %-24s %s",
                $myaffix, $flagname, $rootremove,
                $addtoroot, $rootname, $comment );
        } else {
            $affix_line = sprintf( "%s %s %-5s %-11s %s",
                $myaffix, $flagname, $rootremove,
                $addtoroot, $rootname );
        }
        $rootremove = "0";
        $rootname   = '';
        $addtoroot  = '';
        $comment    = '';
        push @flag_array, $affix_line;
        debugprint("$affix_line\n");
    }

    # Anything else (e.g. the preamble before the first table) is ignored.
}

# Flush the rules of the last flag.
shipoutflag();

close AFFIXFILE;
323
# Append the replacement table when one was requested with --replacements.
process_replacements($replacements) if $replacements;
327
328 __END__
329
330 =head1 NAME
331
332 B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.
333
334 =head1 SYNOPSIS
335
336 ispellaff2myspell [options] <affixfile> --myheader your_header
337
338 Options:
339
340 --affixfile=s Affix file
341 --bylocale Use current locale setup for upper/lowercase
342 conversion
343 --charset=s Use specified charset for upper/lowercase
344 conversion (defaults to latin1)
345 --debug Print debugging info
346 --extraflags=s Allow some non alphabetic flags
347 --lowercase=s Lowercase string
348 --myheader=s Header file
349 --printcomments Print commented lines in output
350 --replacements=s Replacements file
351 --split=i Split flags with more than i entries
352 --uppercase=s Uppercase string
353
354 =head1 DESCRIPTION
355
356 B<ispellaff2myspell> is a script that will convert ispell affix tables
357 to myspell format in a more or less successful way.
358
359 This script does not create the dict file. Something like
360
361 ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
362
363 should do the work, with mydict.words+ being the munched wordlist
364
365 =head1 OPTIONS
366
367 =over 8
368
369 =item B<--affixfile=s>
370
371 Affix file. You can put it directly in the command line.
372
373 =item B<--bylocale>
374
375 Use current locale setup for upper/lowercase conversion. Make sure
376 that the selected locale matches the dictionary's, or you might get
377 into trouble.
378
379 =item B<--charset=s>
380
381 Use specified charset for upper/lowercase conversion (defaults to latin1).
382 Currently allowed values for charset are: latin0, latin1, latin2, latin3.
383
384 =item B<--debug>
385
386 Print some debugging info.
387
388 =item B<--extraflags:s>
389
390 Allows some non alphabetic flags.
391
392 When invoked with no value the supported flags are currently those
393 corresponding to chars represented with the escape char B<\> as
394 first char. B<\> will be stripped.
395
396 When given with the flag prefix will allow that flag and strip the
397 given prefix. Be careful when giving the prefix to properly escape chars,
398 e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
399 B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
400 flags and pass them unmodified.
401
402 You will need a call to -e for each flag type, e.g.,
403 B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).
404
405 When a prefix is explicitly set, the default value (anything starting with B<\>)
406 is disabled and you need to enable it explicitly, as in the previous example.
407
408 =item B<--lowercase=s>
409
410 Lowercase string. Manually set the string of lowercase chars. This
411 requires B<--uppercase> having exactly that string but uppercase.
412
413 =item B<--myheader=s>
414
415 Header file. The myspell aff header. You need to write it
416 manually. This can contain everything you want to be before the affix table
417
418 =item B<--printcomments>
419
420 Print commented lines in output.
421
422 =item B<--replacements=file>
423
424 Add a pre-defined replacements table taken from 'file' to the .aff file.
425 Will skip lines not beginning with REP, and set the replacements number
426 appropriately.
427
428 =item B<--split=i>
429
430 Split flags with more than i entries. This can be of interest for flags
431 having a lot of entries. Will split the flag in chunks containing B<i>
432 entries.
433
434 =item B<--uppercase=s>
435
436 Uppercase string. Manually set the string of uppercase chars. This
437 requires B<--lowercase> having exactly that string but lowercase.
438
439 =back
440
441 If your encoding is currently unsupported you can send me a file with
442 the two strings of lower and uppercase chars. Note that they must match
443 exactly but case changed. It will look something like
444
445 $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
446 $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
447
448 =head1 SEE ALSO
449
450 The OpenOffice.org Lingucomponent Project home page
451
452 L<http://lingucomponent.openoffice.org/index.html>
453
454 and the document
455
456 L<http://lingucomponent.openoffice.org/affix.readme>
457
458 that provides information about the basics of the myspell affix file format.
459
460 You can also take a look at
461
462 /usr/share/doc/libmyspell-dev/affix.readme.gz
463 /usr/share/doc/libmyspell-dev/README.compoundwords
464 /usr/share/doc/libmyspell-dev/README.replacetable
465
466 in your Debian system.
467
468 =head1 AUTHORS
469
470 Agustin Martin <agustin.martin@hispalinux.es>
471
472 =cut

admin@koozali.org
ViewVC Help
Powered by ViewVC 1.2.1 RSS 2.0 feed