genChRanges.py 15.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
11 12 13 14 15 16
# 18 October 2003
# Modified to maintain binary compatibility with previous library versions
# by adding a suffix 'Q' ('quick') to the macro generated for the original,
# function, and adding generation of a function (with the original name) which
# instantiates the macro.
#
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194

import sys
import string
import time

#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    ret = []
    pos = 0
    while pos < len(lst):
	try:		# index generates exception if not present
	    s = lst[pos:].index(1)	# look for start of next range
	except:
	    break			# if no more, finished
	pos += s			# pointer to start of possible range
	try:
	    e = lst[pos:].index(0)	# look for end of range
	    e += pos
	except:				# if no end, set to end of list
	    e = len(lst)
	ret.append((pos, e-1))		# append range tuple to list
	pos = e + 1			# ready to check for next range
    return ret

sources = "chvalid.def"			# input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

state = 0

try:
    defines = open("chvalid.def", "r")
except:
    print "Missing chvalid.def, aborting ..."
    sys.exit(1)

#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:	    Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
# These lines are processed below.
#
for line in defines.readlines():
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = string.split(line, ' ')
	#
	# name line:
	#   validate any previous function block already ended
	#   validate this function not already defined
	#   initialize an entry in the function dicitonary
	#	including a mask table with no values yet defined
	#
	if fields[0] == 'name':
	    name = fields[1]
	    if state != 0:
		print "'name' %s found before previous name" \
		      "completed" % (fields[1])
		continue
	    state = 1
	    if Functs.has_key(name):
		print "name '%s' already present - may give" \
		      " wrong results" % (name)
	    else:
		# dict entry with two list elements (chdata, rangedata)
		Functs[name] = [ [], [] ]
		for v in range(256):
		    Functs[name][0].append(0)
	#
	# end line:
	#   validate there was a preceding function name line
	#   set state to show no current function active
	#
	elif fields[0] == 'end':
	    if state == 0:
		print "'end' found outside of function block"
		continue
	    state = 0

	#
	# ur line:
	#   validate function has been defined
	#   process remaining fields on the line, which may be either
	#	individual unicode values or ranges of values
	#
	elif fields[0] == 'ur':
	    if state != 1:
		raise ValidationError, "'ur' found outside of 'name' block"
	    for el in fields[1:]:
		pos = string.find(el, '..')
		# pos <=0 means not a range, so must be individual value
		if pos <= 0:
		    # cheap handling of hex or decimal values
		    if el[0:2] == '0x':
		        value = int(el[2:],16)
		    elif el[0] == "'":
			value = ord(el[1])
		    else:
			value = int(el)
		    if ((value < 0) | (value > 0x1fffff)):
			raise ValidationError, 'Illegal value (%s) in ch for'\
				' name %s' % (el,name)
		    # for ur we have only ranges (makes things simpler),
		    # so convert val to range
		    currange = (value, value)
		# pos > 0 means this is a range, so isolate/validate
		# the interval
		else:
		    # split the range into it's first-val, last-val
		    (first, last) = string.split(el, "..")
		    # convert values from text into binary
		    if first[0:2] == '0x':	
			start = int(first[2:],16)
		    elif first[0] == "'":
			start = ord(first[1])
		    else:
			start = int(first)
		    if last[0:2] == '0x':
			end = int(last[2:],16)
		    elif last[0] == "'":
			end = ord(last[1])
		    else:
			end = int(last)
		    if (start < 0) | (end > 0x1fffff) | (start > end):
			raise ValidationError, "Invalid range '%s'" % el
		    currange = (start, end)
		# common path - 'currange' has the range, now take care of it
		# We split on single-byte values vs. multibyte
		if currange[1] < 0x100:	# single-byte
		    for ch in range(currange[0],currange[1]+1):
			# validate that value not previously defined
			if Functs[name][0][ch]:
			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
			    raise ValidationError, msg
			Functs[name][0][ch] = 1
		else:			# multi-byte
		    if currange in Functs[name][1]:
			raise ValidationError, "range already defined in" \
				" function"
		    else:
			Functs[name][1].append(currange)

    except:
	print "Failed to process line: %s" % (line)
	raise
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and'
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#

#
# Generate the headings:
#
try:
195
    header = open("include/libxml/chvalid.h", "w")
196
except:
197
    print "Failed to open include/libxml/chvalid.h"
198 199 200 201 202 203 204 205 206 207 208 209
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except:
    print "Failed to open chvalid.c"
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
210 211 212
 * Summary: Unicode character range checking
 * Description: this module exports interfaces for the character
 *               range validation APIs
213 214 215 216 217 218
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
219
 * Author: William Brack <wbrack@mmm.com.hk>
220 221 222 223 224
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

225
#include <libxml/xmlversion.h>
226
#include <libxml/xmlstring.h>
227

228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short	low;
    unsigned short	high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
246 247
    unsigned int	low;
    unsigned int	high;
248 249 250 251 252 253 254
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int			nbShortRange;
    int			nbLongRange;
255 256
    const xmlChSRange	*shortRange;	/* points to an array of ranges */
    const xmlChLRange	*longRange;
257 258
};

259 260 261
/**
 * Range checking routine
 */
262
XMLPUBFUN int XMLCALL
263
		xmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278

""" % (date, sources));
output.write(
"""/*
 * chvalid.c:	this module implements the character range
 *		validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

279 280
#define IN_LIBXML
#include "libxml.h"
281
#include <libxml/chvalid.h>
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources));

#
# Now output the generated data.
# We try to produce the best execution times.  Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares.  So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#

fkeys = Functs.keys()	# Dictionary of all defined functions
fkeys.sort()		# Put some order to our output

for f in fkeys:

# First we convert the specified single-byte values into a group of ranges.
# If the total number of such ranges is less than minTableSize, we generate
# an inline macro for direct comparisons; if greater, we generate a lookup
# table.
    if max(Functs[f][0]) > 0:	# only check if at least one entry
        rangeTable = makeRange(Functs[f][0])
	numRanges = len(rangeTable)
	if numRanges >= minTableSize:	# table is worthwhile
315
	    header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
316 317 318 319 320 321 322 323
	    header.write("""
/**
 * %s_ch:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
324 325 326
	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

	    # write the constant data to the code file
327
	    output.write("const unsigned char %s_tab[256] = {\n" % f)
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
	    pline = "   "
	    for n in range(255):
		pline += " 0x%02x," % Functs[f][0][n]
		if len(pline) > 72:
		    output.write(pline + "\n")
		    pline = "   "
	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])

	else:		# inline check is used
	    # first another little optimisation - if space is present,
	    # put it at the front of the list so it is checked first
	    try:
		ix = rangeTable.remove((0x20, 0x20))
		rangeTable.insert(0, (0x20, 0x20))
	    except:
		pass
	    firstFlag = 1
345 346 347 348 349 350 351 352 353
	    
	    header.write("""
/**
 * %s_ch:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
354 355 356 357 358 359 360 361 362 363
	    # okay, I'm tired of the messy lineup - let's automate it!
	    pline = "#define %s_ch(c)" % f
	    # 'ntab' is number of tabs needed to position to col. 33 from name end
	    ntab = 4 - (len(pline)) / 8
	    if ntab < 0:
		ntab = 0
	    just = ""
	    for i in range(ntab):
		just += "\t"
	    pline = pline + just + "("
364 365
	    for rg in rangeTable:
		if not firstFlag:
366
		    pline += " || \\\n\t\t\t\t "
367 368 369
		else:
		    firstFlag = 0
		if rg[0] == rg[1]:		# single value - check equal
370 371
		    pline += "((c) == 0x%x)" % rg[0]
		else:				# value range
372 373 374 375 376 377
		# since we are doing char, also change range ending in 0xff
		    if rg[1] != 0xff:
		        pline += "((0x%x <= (c)) &&" % rg[0]
		        pline += " ((c) <= 0x%x))" % rg[1]
		    else:
		        pline += " (0x%x <= (c))" % rg[0]
378 379 380
	    pline += ")\n"
	    header.write(pline)

381 382 383 384 385 386 387 388
    header.write("""
/**
 * %sQ:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
389
    pline = "#define %sQ(c)" % f
390 391 392 393 394 395 396
    ntab = 4 - (len(pline)) / 8
    if ntab < 0:
	ntab = 0
    just = ""
    for i in range(ntab):
	just += "\t"
    header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
397 398 399 400 401 402
    if max(Functs[f][0]) > 0:
	header.write("%s_ch((c)) :" % f)
    else:
	header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
403 404
    numRanges = len(Functs[f][1])
    if numRanges == 0:
405 406
	header.write(" 0)\n\n")
    else:
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
	if numRanges >= minTableSize:
	    header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
	else:		# if < minTableSize, generate inline code
	    firstFlag = 1
	    for rg in Functs[f][1]:
		if not firstFlag:
		    pline += " || \\\n\t\t\t\t "
		else:
		    firstFlag = 0
		    pline = "\\\n\t\t\t\t("
		if rg[0] == rg[1]:		# single value - check equal
		    pline += "((c) == 0x%x)" % rg[0]
		else:				# value range
		    pline += "((0x%x <= (c)) &&" % rg[0]
		    pline += " ((c) <= 0x%x))" % rg[1]
	    pline += "))\n\n"
	    header.write(pline)

425 426

    if len(Functs[f][1]) > 0:
427
	header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442


#
# Next we do the unicode ranges
#

for f in fkeys:
    if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
	rangeTable = Functs[f][1]
	rangeTable.sort()	# ascending tuple sequence
	numShort = 0
	numLong  = 0
	for rg in rangeTable:
	    if rg[1] < 0x10000:	# if short value
		if numShort == 0:	# first occurence
443
		    pline = "static const xmlChSRange %s_srng[] = { " % f
444 445 446 447 448 449 450 451 452 453 454
		else:
		    pline += ", "
		numShort += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	    else:		# if long value
		if numLong == 0:	# first occurence
		    if numShort > 0:	# if there were shorts, finish them off
			output.write(pline + "};\n")
455
		    pline = "static const xmlChLRange %s_lrng[] = { " % f
456 457 458 459 460 461 462 463 464
		else:
		    pline += ", "
		numLong += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	output.write(pline + "};\n")	# finish off last group

465
	pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
466 467
	if numShort > 0:
	    pline += "%s_srng" % f
468 469
	else:
	    pline += "(xmlChSRangePtr)0"
470 471
	if numLong > 0:
	    pline += ", %s_lrng" % f
472 473
	else:
	    pline += ", (xmlChLRangePtr)0"
474 475 476 477 478
	
	output.write(pline + "};\n\n")

output.write(
"""
479 480 481 482 483 484 485 486 487 488
/**
 * xmlCharInRange:
 * @val: character to be validated
 * @rptr: pointer to range to be used to validate
 *
 * Does a binary search of the range table to determine if char
 * is valid
 *
 * Returns: true if character valid, false otherwise
 */
489
int
490
xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
491
    int low, high, mid;
492 493
    const xmlChSRange *sptr;
    const xmlChLRange *lptr;
494 495

    if (rptr == NULL) return(0);
496 497 498 499
    if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
	if (rptr->nbShortRange == 0)
	    return 0;
	low = 0;
500
	high = rptr->nbShortRange - 1;
501 502 503
	sptr = rptr->shortRange;
	while (low <= high) {
	    mid = (low + high) / 2;
504
	    if ((unsigned short) val < sptr[mid].low) {
505
		high = mid - 1;
506 507 508 509 510 511 512
	    } else {
	        if ((unsigned short) val > sptr[mid].high) {
		    low = mid + 1;
		} else {
		    return 1;
		}
	    }
513 514
	}
    } else {
515
	if (rptr->nbLongRange == 0) {
516
	    return 0;
517
	}
518
	low = 0;
519
	high = rptr->nbLongRange - 1;
520 521 522
	lptr = rptr->longRange;
	while (low <= high) {
	    mid = (low + high) / 2;
523
	    if (val < lptr[mid].low) {
524
		high = mid - 1;
525 526 527 528 529 530 531
	    } else {
	        if (val > lptr[mid].high) {
		    low = mid + 1;
		} else {
		    return 1;
		}
	    }
532 533 534 535 536 537 538
	}
    }
    return 0;
}

""");

539 540 541 542
#
# finally, generate the ABI compatibility functions
#
for f in fkeys:
543 544 545 546 547
    output.write("""
/**
 * %s:
 * @ch:  character to validate
 *
548 549 550 551 552 553 554
 * This function is DEPRECATED.
""" % f);
    if max(Functs[f][0]) > 0:
        output.write(" * Use %s_ch or %sQ instead" % (f, f))
    else:
        output.write(" * Use %sQ instead" % f)
    output.write("""
555 556 557
 *
 * Returns true if argument valid, false otherwise
 */
558
""")
559 560 561 562 563 564 565 566 567 568 569
    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
    header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f);
#
# Run complete - write trailers and close the output files
#

header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
570
""")
571 572

header.close()
573 574 575 576

output.write("""#define bottom_chvalid
#include "elfgcchack.h"
""")
577
output.close()
578