Language tools/checkutftable
Note that this program is under the GNU GPL, not under the GNU FDL. If you contribute to this program, please add your name to the copyright notice.
I wrote this module to check whether a text on the Serbian Wikipedia is in Cyrillic or in Latin script. However, I realized that it can be useful for other languages, too. Characters which do not belong strictly to any script are described as 'not important'. If a character is not recognized by the unicodedata module, it is described as 'not known'.
#!/usr/bin/python
#
# Copyright (C) 2006 Milos Rancic
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Text of GNU GPL license can be found at http://www.gnu.org/licenses/gnu.html
import sys
import unicodedata
# engine
# First four characters of the Unicode character names that are counted as
# separate groups.  Three-letter script names keep the trailing space that
# follows them in the full character name (e.g. "CJK UNIFIED IDEOGRAPH-...").
_SCRIPT_PREFIXES = (
    'ARAB',  # ARABIC
    'ARME',  # ARMENIAN
    'BENG',  # BENGALI
    'BOPO',  # BOPOMOFO
    'BRAI',  # BRAILLE PATTERN
    'BUHI',  # BUHID
    'CANA',  # CANADIAN
    'CHER',  # CHEROKEE
    'CJK ',  # CJK
    'COMB',  # COMBINING
    'COPT',  # COPTIC
    'CYRI',  # CYRILLIC
    'DESE',  # DESERET
    'DEVA',  # DEVANAGARI
    'ETHI',  # ETHIOPIC
    'EXTE',  # EXTENDED ARABIC
    'GEOR',  # GEORGIAN
    'GOTH',  # GOTHIC
    'GREE',  # GREEK
    'GUJA',  # GUJARATI
    'GURM',  # GURMUKHI
    'HANG',  # HANGUL
    'HANU',  # HANUNOO
    'HEBR',  # HEBREW
    'HIRA',  # HIRAGANA
    'IDEO',  # IDEOGRAPHIC
    'KANG',  # KANGXI RADICAL
    'KANN',  # KANNADA
    'KATA',  # KATAKANA
    'KHME',  # KHMER
    'LAO ',  # LAO
    'LATI',  # LATIN
    'MALA',  # MALAYALAM
    'MODI',  # MODIFIER LETTER
    'MONG',  # MONGOLIAN
    'MYAN',  # MYANMAR
    'OGHA',  # OGHAM
    'OLD ',  # OLD ITALIC
    'ORIY',  # ORIYA
    'PHIL',  # PHILIPPINE
    'RUNI',  # RUNIC
    'SINH',  # SINHALA
    'SYRI',  # SYRIAC
    'TAGA',  # TAGALOG
    'TAGB',  # TAGBANWA
    'TAMI',  # TAMIL
    'TELU',  # TELUGU
    'THAA',  # THAANA
    'THAI',  # THAI
    'TIBE',  # TIBETAN
    'YI R',  # YI RADICAL
    'YI S',  # YI SYLLABLE
)


def check(f):
    """Count the characters of a UTF-8 encoded file per character group.

    Every character is classified by the first four characters of its
    Unicode name as reported by ``unicodedata.name``.

    Args:
        f: path of the file to read; its content is decoded as UTF-8.

    Returns:
        A dict mapping each group key to its number of occurrences.  Besides
        the prefixes listed in ``_SCRIPT_PREFIXES`` it contains:
          'all'  -- total number of characters read,
          'ni'   -- "not important": the name prefix is not a counted group,
          'nnnn' -- "not known": unicodedata has no name for the character.
    """
    # Read raw bytes and decode explicitly; the context manager guarantees
    # that the file is closed even if reading or decoding fails.
    with open(f, 'rb') as handle:
        text = handle.read().decode('utf-8')

    numchar = dict.fromkeys(_SCRIPT_PREFIXES, 0)
    numchar['all'] = 0
    numchar['ni'] = 0    # not important
    numchar['nnnn'] = 0  # not known

    for symbol in text:
        # "nnnn" is the fallback for characters without a Unicode name and
        # doubles as the key of the "not known" counter.
        name = unicodedata.name(symbol, "nnnn")
        # Pad short names so the lookup key is always four characters wide.
        group = name[:4].ljust(4)
        if group not in numchar:
            group = 'ni'
        numchar[group] += 1
        numchar['all'] += 1
    return numchar
# the list of existing character groups
def describe(d):
    """Return the share of every character group that occurs at least once.

    Args:
        d: dict of character counts per group, with the total under 'all'.

    Returns:
        A dict mapping each group whose count is positive (the 'all' entry
        excluded) to a two-item list ``[group, share]``, where ``share`` is
        the group's count divided by ``d['all']``.
    """
    shares = {}
    for group, count in d.items():
        # Skip groups that never occurred, and the grand total itself.
        if not (count > 0) or group == 'all':
            continue
        shares[group] = [group, float(count) / float(d['all'])]
    return shares
# which group of characters has an absolute majority or, failing that, a relative one
def decide(d):
    """Find the dominant character group of a text.

    Args:
        d: dict of character counts per group, with the total under 'all'.

    Returns:
        A three-item list ``[kind, group, share]``:
          ['a', group, share] -- ``group`` has an absolute majority
                                 (more than half of all characters),
          ['r', group, share] -- ``group`` only has a relative majority
                                 (the largest share, but at most one half),
          ['n', 'n', 0]       -- no group has a positive count, e.g. the
                                 text is empty.
    """
    majority = ''     # group holding more than half of all characters
    leader = None     # group with the largest share seen so far
    leader_share = 0
    shares = {}
    for group in d:
        if d[group] > 0 and group != 'all':
            shares[group] = float(d[group]) / float(d['all'])
            if shares[group] > leader_share:
                leader_share = shares[group]
                leader = group
            if shares[group] > 0.5:
                majority = group

    if majority != '':
        return ['a', majority, shares[majority]]
    if leader is None:
        # Nothing was counted at all; report "none" instead of failing on
        # an unset variable.
        return ['n', 'n', 0]
    return ['r', leader, leader_share]
# for Serbian (Wikipedia): what is the script of the text
def analyzecyr(d):
    """Decide whether a text is written in Cyrillic or in Latin script.

    Only the 'CYRI' and 'LATI' counts are compared with each other; all
    other characters are ignored.  The dict passed in is not modified.

    Args:
        d: dict of character counts per group.

    Returns:
        A three-item list ``[kind, script, share]`` where ``share`` is the
        script's part of the Cyrillic and Latin characters taken together:
          ['a', script, share] -- share above 0.5,
          ['r', script, share] -- share above 0.3 but not above 0.5,
          ['n', 'n', 0]        -- either count is missing from ``d`` or the
                                  text has no Cyrillic or Latin characters.
    """
    if 'CYRI' not in d or 'LATI' not in d:
        return ['n', 'n', 0]

    # Keep the combined count local: storing it in ``d`` would make it show
    # up as a bogus character group for later users of the same dict.
    both = d['CYRI'] + d['LATI']
    if both == 0:
        # Neither script occurs, so there is no ratio to compute.
        return ['n', 'n', 0]

    cyrillic = float(d['CYRI']) / float(both)
    latin = float(d['LATI']) / float(both)

    # NOTE(review): Cyrillic is tested first, so a text that is between 30%
    # and 50% Cyrillic is reported as ['r', 'CYRI', ...] although Latin is
    # the larger share -- presumably deliberate for Serbian Wikipedia;
    # confirm before reordering.
    if cyrillic > 0.5:
        return ['a', 'CYRI', cyrillic]
    if cyrillic > 0.3:
        return ['r', 'CYRI', cyrillic]
    if latin > 0.5:
        return ['a', 'LATI', latin]
    if latin > 0.3:
        return ['r', 'LATI', latin]
    return ['n', 'n', 0]
# if the program is started from the command line
# Run the analysis only when executed as a script, not when the module is
# imported for its functions.
if __name__ == '__main__':
    # Without a file argument there is nothing to analyze; stay silent.
    if len(sys.argv) > 1:
        dictionary = check(sys.argv[1])
        # Parenthesized single-argument print gives the same output under
        # both the print statement and the print function.
        print(describe(dictionary))
        print(decide(dictionary))
        print(analyzecyr(dictionary))
Category:Language tools