File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / doc / index.py
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:37:59 2012 UTC (12 years, 4 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, v2_8_0p0, v2_8_0, v2_7_8, HEAD
libxml2

    1: #!/usr/bin/python -u
    2: #
    3: # imports the API description and fills up a database with
    4: # name relevance to modules, functions or web pages
    5: #
    6: # Operation needed:
    7: # =================
    8: #
    9: # install mysqld, the python wrappers for mysql and libxml2, start mysqld
   10: # Change the root passwd of mysql:
   11: #    mysqladmin -u root password new_password
   12: # Create the new database xmlsoft
   13: #    mysqladmin -p create xmlsoft
   14: # Create a database user 'veillard' and give him password access
   15: # change veillard and abcde with the right user name and passwd
   16: #    mysql -p
   17: #    password:
   18: #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
   19: #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
   20: #
   21: # As the user check the access:
   22: #    mysql -p xmlsoft
   23: #    Enter password:
   24: #    Welcome to the MySQL monitor....
   25: #    mysql> use xmlsoft
   26: #    Database changed
   27: #    mysql> quit
   28: #    Bye
   29: #
   30: # Then run the script in the doc subdir; it will create the symbols and
   31: # word tables and populate them with information extracted from
   32: # the libxml2-api.xml API description, and make them accessible read-only
   33: # to nobody@localhost, the user Apache is expected to run as
   34: #
   35: # On the Apache configuration, make sure you have php support enabled
   36: #
   37: 
   38: import MySQLdb
   39: import libxml2
   40: import sys
   41: import string
   42: import os
   43: 
   44: #
   45: # We are not interested in parsing errors here
   46: #
   47: def callback(ctx, str):
   48:     return
# Register the do-nothing callback above as the libxml2 error handler,
# with None as the second argument.
libxml2.registerErrorHandler(callback, None)
   50: 
   51: #
   52: # The dictionary of required tables and the SQL commands needed
   53: # to create them
   54: #
# Maps each table name to the CREATE TABLE statement that createTable()
# executes for it; checkTables() iterates over the keys.
TABLES={
  # one row per API symbol (written by updateSymbol)
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))""",
  # word -> symbol relevance (written by updateWord)
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))""",
  # word -> web page relevance (written by updateWordHTML)
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))""",
  # word -> archive entry relevance (written by updateWordArchive)
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))""",
  # resource -> title (written by addPage)
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))""",
  # archive entries with an auto-incremented ID (written by addXMLMsgArchive)
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))""",
  # not written by the code visible here; checkTables() grants
  # INSERT,SELECT,UPDATE on it to nobody@localhost
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
  # same columns as Queries; not written by the code visible here
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
}
  109: 
  110: #
  111: # The XML API description file to parse
  112: #
# File name of the XML API description (a relative path)
API="libxml2-api.xml"
# Shared database connection; stays None until openMySQL() assigns it
DB=None
  115: 
  116: #########################################################################
  117: #									#
  118: #                  MySQL database interfaces				#
  119: #									#
  120: #########################################################################
  121: def createTable(db, name):
  122:     global TABLES
  123: 
  124:     if db == None:
  125:         return -1
  126:     if name == None:
  127:         return -1
  128:     c = db.cursor()
  129: 
  130:     ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
  131:     if ret == 1:
  132:         print "Removed table %s" % (name)
  133:     print "Creating table %s" % (name)
  134:     try:
  135:         ret = c.execute(TABLES[name])
  136:     except:
  137:         print "Failed to create table %s" % (name)
  138: 	return -1
  139:     return ret
  140: 
  141: def checkTables(db, verbose = 1):
  142:     global TABLES
  143: 
  144:     if db == None:
  145:         return -1
  146:     c = db.cursor()
  147:     nbtables = c.execute("show tables")
  148:     if verbose:
  149: 	print "Found %d tables" % (nbtables)
  150:     tables = {}
  151:     i = 0
  152:     while i < nbtables:
  153:         l = c.fetchone()
  154: 	name = l[0]
  155: 	tables[name] = {}
  156:         i = i + 1
  157: 
  158:     for table in TABLES.keys():
  159:         if not tables.has_key(table):
  160: 	    print "table %s missing" % (table)
  161: 	    createTable(db, table)
  162: 	try:
  163: 	    ret = c.execute("SELECT count(*) from %s" % table);
  164: 	    row = c.fetchone()
  165: 	    if verbose:
  166: 		print "Table %s contains %d records" % (table, row[0])
  167: 	except:
  168: 	    print "Troubles with table %s : repairing" % (table)
  169: 	    ret = c.execute("repair table %s" % table);
  170: 	    print "repairing returned %d" % (ret)
  171: 	    ret = c.execute("SELECT count(*) from %s" % table);
  172: 	    row = c.fetchone()
  173: 	    print "Table %s contains %d records" % (table, row[0])
  174:     if verbose:
  175: 	print "checkTables finished"
  176: 
  177:     # make sure apache can access the tables read-only
  178:     try:
  179: 	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
  180: 	ret = c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
  181:     except:
  182:         pass
  183:     return 0
  184:     
  185: def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
  186:     global DB
  187: 
  188:     if passwd == None:
  189:         try:
  190: 	    passwd = os.environ["MySQL_PASS"]
  191: 	except:
  192: 	    print "No password available, set environment MySQL_PASS"
  193: 	    sys.exit(1)
  194: 
  195:     DB = MySQLdb.connect(passwd=passwd, db=db)
  196:     if DB == None:
  197:         return -1
  198:     ret = checkTables(DB, verbose)
  199:     return ret
  200: 
  201: def updateWord(name, symbol, relevance):
  202:     global DB
  203: 
  204:     if DB == None:
  205:         openMySQL()
  206:     if DB == None:
  207:         return -1
  208:     if name == None:
  209:         return -1
  210:     if symbol == None:
  211:         return -1
  212: 
  213:     c = DB.cursor()
  214:     try:
  215: 	ret = c.execute(
  216: """INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
  217: 		(name, symbol, relevance))
  218:     except:
  219:         try:
  220: 	    ret = c.execute(
  221:     """UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
  222: 		    (relevance, name, symbol))
  223: 	except:
  224: 	    print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
  225: 	    print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
  226: 	    print sys.exc_type, sys.exc_value
  227: 	    return -1
  228: 	     
  229:     return ret
  230: 
  231: def updateSymbol(name, module, type, desc):
  232:     global DB
  233: 
  234:     updateWord(name, name, 50)
  235:     if DB == None:
  236:         openMySQL()
  237:     if DB == None:
  238:         return -1
  239:     if name == None:
  240:         return -1
  241:     if module == None:
  242:         return -1
  243:     if type == None:
  244:         return -1
  245: 
  246:     try:
  247: 	desc = string.replace(desc, "'", " ")
  248: 	l = string.split(desc, ".")
  249: 	desc = l[0]
  250: 	desc = desc[0:99]
  251:     except:
  252:         desc = ""
  253: 
  254:     c = DB.cursor()
  255:     try:
  256: 	ret = c.execute(
  257: """INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
  258:                     (name, module, type, desc))
  259:     except:
  260:         try:
  261: 	    ret = c.execute(
  262: """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
  263:                     (module, type, desc, name))
  264:         except:
  265: 	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
  266: 	    print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
  267: 	    print sys.exc_type, sys.exc_value
  268: 	    return -1
  269: 	     
  270:     return ret
  271:         
# The helpers below all forward to updateSymbol(), each with a fixed
# symbol type string; desc defaults to an empty description.

def addFunction(name, module, desc = ""):
    """Record name from module as a symbol of type 'function'."""
    return updateSymbol(name, module, 'function', desc)

def addMacro(name, module, desc = ""):
    """Record name from module as a symbol of type 'macro'."""
    return updateSymbol(name, module, 'macro', desc)

def addEnum(name, module, desc = ""):
    """Record name from module as a symbol of type 'enum'."""
    return updateSymbol(name, module, 'enum', desc)

def addStruct(name, module, desc = ""):
    """Record name from module as a symbol of type 'struct'."""
    return updateSymbol(name, module, 'struct', desc)

def addConst(name, module, desc = ""):
    """Record name from module as a symbol of type 'const'."""
    return updateSymbol(name, module, 'const', desc)

def addType(name, module, desc = ""):
    """Record name from module as a symbol of type 'type'."""
    return updateSymbol(name, module, 'type', desc)

def addFunctype(name, module, desc = ""):
    """Record name from module as a symbol of type 'functype'."""
    return updateSymbol(name, module, 'functype', desc)
  292: 
  293: def addPage(resource, title):
  294:     global DB
  295: 
  296:     if DB == None:
  297:         openMySQL()
  298:     if DB == None:
  299:         return -1
  300:     if resource == None:
  301:         return -1
  302: 
  303:     c = DB.cursor()
  304:     try:
  305: 	ret = c.execute(
  306: 	    """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
  307:                     (resource, title))
  308:     except:
  309:         try:
  310: 	    ret = c.execute(
  311: 		"""UPDATE pages SET title='%s' WHERE resource='%s'""" %
  312:                     (title, resource))
  313:         except:
  314: 	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
  315: 	    print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
  316: 	    print sys.exc_type, sys.exc_value
  317: 	    return -1
  318: 	     
  319:     return ret
  320: 
  321: def updateWordHTML(name, resource, desc, id, relevance):
  322:     global DB
  323: 
  324:     if DB == None:
  325:         openMySQL()
  326:     if DB == None:
  327:         return -1
  328:     if name == None:
  329:         return -1
  330:     if resource == None:
  331:         return -1
  332:     if id == None:
  333:         id = ""
  334:     if desc == None:
  335:         desc = ""
  336:     else:
  337: 	try:
  338: 	    desc = string.replace(desc, "'", " ")
  339: 	    desc = desc[0:99]
  340: 	except:
  341: 	    desc = ""
  342: 
  343:     c = DB.cursor()
  344:     try:
  345: 	ret = c.execute(
  346: """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
  347:                     (name, resource, desc, id, relevance))
  348:     except:
  349:         try:
  350: 	    ret = c.execute(
  351: """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
  352:                     (desc, id, relevance, name, resource))
  353:         except:
  354: 	    print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
  355: 	    print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
  356: 	    print sys.exc_type, sys.exc_value
  357: 	    return -1
  358: 	     
  359:     return ret
  360: 
  361: def checkXMLMsgArchive(url):
  362:     global DB
  363: 
  364:     if DB == None:
  365:         openMySQL()
  366:     if DB == None:
  367:         return -1
  368:     if url == None:
  369:         return -1
  370: 
  371:     c = DB.cursor()
  372:     try:
  373: 	ret = c.execute(
  374: 	    """SELECT ID FROM archives WHERE resource='%s'""" % (url))
  375: 	row = c.fetchone()
  376: 	if row == None:
  377: 	    return -1
  378:     except:
  379: 	return -1
  380: 	     
  381:     return row[0]
  382:     
  383: def addXMLMsgArchive(url, title):
  384:     global DB
  385: 
  386:     if DB == None:
  387:         openMySQL()
  388:     if DB == None:
  389:         return -1
  390:     if url == None:
  391:         return -1
  392:     if title == None:
  393:         title = ""
  394:     else:
  395: 	title = string.replace(title, "'", " ")
  396: 	title = title[0:99]
  397: 
  398:     c = DB.cursor()
  399:     try:
  400:         cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
  401:         ret = c.execute(cmd)
  402: 	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
  403:         ret = c.execute(cmd)
  404: 	row = c.fetchone()
  405: 	if row == None:
  406: 	    print "addXMLMsgArchive failed to get the ID: %s" % (url)
  407: 	    return -1
  408:     except:
  409:         print "addXMLMsgArchive failed command: %s" % (cmd)
  410: 	return -1
  411: 	     
  412:     return((int)(row[0]))
  413: 
  414: def updateWordArchive(name, id, relevance):
  415:     global DB
  416: 
  417:     if DB == None:
  418:         openMySQL()
  419:     if DB == None:
  420:         return -1
  421:     if name == None:
  422:         return -1
  423:     if id == None:
  424:         return -1
  425: 
  426:     c = DB.cursor()
  427:     try:
  428: 	ret = c.execute(
  429: """INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
  430:                     (name, id, relevance))
  431:     except:
  432:         try:
  433: 	    ret = c.execute(
  434: """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
  435:                     (relevance, name, id))
  436:         except:
  437: 	    print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
  438: 	    print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
  439: 	    print sys.exc_type, sys.exc_value
  440: 	    return -1
  441: 	     
  442:     return ret
  443: 
  444: #########################################################################
  445: #									#
  446: #                  Word dictionary and analysis routines		#
  447: #									#
  448: #########################################################################
  449: 
  450: #
  451: # top 100 English words, excluding those shorter than 3 letters, plus our own set
  452: #
  453: dropWords = {
  454:     'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
  455:     'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
  456:     'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
  457:     'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
  458:     'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
  459:     'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
  460:     'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
  461:     'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
  462:     'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
  463:     'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
  464:     'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
  465:     'down':0,
  466:     'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
  467: }
  468: 
# word -> {(module, symbol): relevance}, filled by addWord(); the inner
# dictionary is replaced by None once it holds more than 500 entries and
# the word is ignored from then on
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML()
wordsDictHTML = {}
# word -> {archive id: relevance}, filled by addWordArchive()
wordsDictArchive = {}
  472: 
  473: def cleanupWordsString(str):
  474:     str = string.replace(str, ".", " ")
  475:     str = string.replace(str, "!", " ")
  476:     str = string.replace(str, "?", " ")
  477:     str = string.replace(str, ",", " ")
  478:     str = string.replace(str, "'", " ")
  479:     str = string.replace(str, '"', " ")
  480:     str = string.replace(str, ";", " ")
  481:     str = string.replace(str, "(", " ")
  482:     str = string.replace(str, ")", " ")
  483:     str = string.replace(str, "{", " ")
  484:     str = string.replace(str, "}", " ")
  485:     str = string.replace(str, "<", " ")
  486:     str = string.replace(str, ">", " ")
  487:     str = string.replace(str, "=", " ")
  488:     str = string.replace(str, "/", " ")
  489:     str = string.replace(str, "*", " ")
  490:     str = string.replace(str, ":", " ")
  491:     str = string.replace(str, "#", " ")
  492:     str = string.replace(str, "\\", " ")
  493:     str = string.replace(str, "\n", " ")
  494:     str = string.replace(str, "\r", " ")
  495:     str = string.replace(str, "\xc2", " ")
  496:     str = string.replace(str, "\xa0", " ")
  497:     return str
  498:     
  499: def cleanupDescrString(str):
  500:     str = string.replace(str, "'", " ")
  501:     str = string.replace(str, "\n", " ")
  502:     str = string.replace(str, "\r", " ")
  503:     str = string.replace(str, "\xc2", " ")
  504:     str = string.replace(str, "\xa0", " ")
  505:     l = string.split(str)
  506:     str = string.join(str)
  507:     return str
  508: 
  509: def splitIdentifier(str):
  510:     ret = []
  511:     while str != "":
  512:         cur = string.lower(str[0])
  513: 	str = str[1:]
  514: 	if ((cur < 'a') or (cur > 'z')):
  515: 	    continue
  516: 	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
  517: 	    cur = cur + string.lower(str[0])
  518: 	    str = str[1:]
  519: 	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
  520: 	    cur = cur + str[0]
  521: 	    str = str[1:]
  522: 	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
  523: 	    str = str[1:]
  524: 	ret.append(cur)
  525:     return ret
  526: 
  527: def addWord(word, module, symbol, relevance):
  528:     global wordsDict
  529: 
  530:     if word == None or len(word) < 3:
  531:         return -1
  532:     if module == None or symbol == None:
  533:         return -1
  534:     if dropWords.has_key(word):
  535:         return 0
  536:     if ord(word[0]) > 0x80:
  537:         return 0
  538: 
  539:     if wordsDict.has_key(word):
  540:         d = wordsDict[word]
  541: 	if d == None:
  542: 	    return 0
  543: 	if len(d) > 500:
  544: 	    wordsDict[word] = None
  545: 	    return 0
  546: 	try:
  547: 	    relevance = relevance + d[(module, symbol)]
  548: 	except:
  549: 	    pass
  550:     else:
  551:         wordsDict[word] = {}
  552:     wordsDict[word][(module, symbol)] = relevance
  553:     return relevance
  554:     
  555: def addString(str, module, symbol, relevance):
  556:     if str == None or len(str) < 3:
  557:         return -1
  558:     ret = 0
  559:     str = cleanupWordsString(str)
  560:     l = string.split(str)
  561:     for word in l:
  562: 	if len(word) > 2:
  563: 	    ret = ret + addWord(word, module, symbol, 5)
  564: 
  565:     return ret
  566: 
  567: def addWordHTML(word, resource, id, section, relevance):
  568:     global wordsDictHTML
  569: 
  570:     if word == None or len(word) < 3:
  571:         return -1
  572:     if resource == None or section == None:
  573:         return -1
  574:     if dropWords.has_key(word):
  575:         return 0
  576:     if ord(word[0]) > 0x80:
  577:         return 0
  578: 
  579:     section = cleanupDescrString(section)
  580: 
  581:     if wordsDictHTML.has_key(word):
  582:         d = wordsDictHTML[word]
  583: 	if d == None:
  584: 	    print "skipped %s" % (word)
  585: 	    return 0
  586: 	try:
  587: 	    (r,i,s) = d[resource]
  588: 	    if i != None:
  589: 	        id = i
  590: 	    if s != None:
  591: 	        section = s
  592: 	    relevance = relevance + r
  593: 	except:
  594: 	    pass
  595:     else:
  596:         wordsDictHTML[word] = {}
  597:     d = wordsDictHTML[word];
  598:     d[resource] = (relevance, id, section)
  599:     return relevance
  600:     
  601: def addStringHTML(str, resource, id, section, relevance):
  602:     if str == None or len(str) < 3:
  603:         return -1
  604:     ret = 0
  605:     str = cleanupWordsString(str)
  606:     l = string.split(str)
  607:     for word in l:
  608: 	if len(word) > 2:
  609: 	    try:
  610: 		r = addWordHTML(word, resource, id, section, relevance)
  611: 		if r < 0:
  612: 		    print "addWordHTML failed: %s %s" % (word, resource)
  613: 		ret = ret + r
  614: 	    except:
  615: 		print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
  616: 		print sys.exc_type, sys.exc_value
  617: 
  618:     return ret
  619: 
  620: def addWordArchive(word, id, relevance):
  621:     global wordsDictArchive
  622: 
  623:     if word == None or len(word) < 3:
  624:         return -1
  625:     if id == None or id == -1:
  626:         return -1
  627:     if dropWords.has_key(word):
  628:         return 0
  629:     if ord(word[0]) > 0x80:
  630:         return 0
  631: 
  632:     if wordsDictArchive.has_key(word):
  633:         d = wordsDictArchive[word]
  634: 	if d == None:
  635: 	    print "skipped %s" % (word)
  636: 	    return 0
  637: 	try:
  638: 	    r = d[id]
  639: 	    relevance = relevance + r
  640: 	except:
  641: 	    pass
  642:     else:
  643:         wordsDictArchive[word] = {}
  644:     d = wordsDictArchive[word];
  645:     d[id] = relevance
  646:     return relevance
  647:     
  648: def addStringArchive(str, id, relevance):
  649:     if str == None or len(str) < 3:
  650:         return -1
  651:     ret = 0
  652:     str = cleanupWordsString(str)
  653:     l = string.split(str)
  654:     for word in l:
  655:         i = len(word)
  656: 	if i > 2:
  657: 	    try:
  658: 		r = addWordArchive(word, id, relevance)
  659: 		if r < 0:
  660: 		    print "addWordArchive failed: %s %s" % (word, id)
  661: 		else:
  662: 		    ret = ret + r
  663: 	    except:
  664: 		print "addWordArchive failed: %s %s %d" % (word, id, relevance)
  665: 		print sys.exc_type, sys.exc_value
  666:     return ret
  667: 
  668: #########################################################################
  669: #									#
  670: #                  XML API description analysis				#
  671: #									#
  672: #########################################################################
  673: 
  674: def loadAPI(filename):
  675:     doc = libxml2.parseFile(filename)
  676:     print "loaded %s" % (filename)
  677:     return doc
  678: 
  679: def foundExport(file, symbol):
  680:     if file == None:
  681:         return 0
  682:     if symbol == None:
  683:         return 0
  684:     addFunction(symbol, file)
  685:     l = splitIdentifier(symbol)
  686:     for word in l:
  687: 	addWord(word, file, symbol, 10)
  688:     return 1
  689:      
  690: def analyzeAPIFile(top):
  691:     count = 0
  692:     name = top.prop("name")
  693:     cur = top.children
  694:     while cur != None:
  695:         if cur.type == 'text':
  696: 	    cur = cur.next
  697: 	    continue
  698: 	if cur.name == "exports":
  699: 	    count = count + foundExport(name, cur.prop("symbol"))
  700: 	else:
  701: 	    print "unexpected element %s in API doc <file name='%s'>" % (name)
  702:         cur = cur.next
  703:     return count
  704: 
  705: def analyzeAPIFiles(top):
  706:     count = 0
  707:     cur = top.children
  708:         
  709:     while cur != None:
  710:         if cur.type == 'text':
  711: 	    cur = cur.next
  712: 	    continue
  713: 	if cur.name == "file":
  714: 	    count = count + analyzeAPIFile(cur)
  715: 	else:
  716: 	    print "unexpected element %s in API doc <files>" % (cur.name)
  717:         cur = cur.next
  718:     return count
  719: 
  720: def analyzeAPIEnum(top):
  721:     file = top.prop("file")
  722:     if file == None:
  723:         return 0
  724:     symbol = top.prop("name")
  725:     if symbol == None:
  726:         return 0
  727: 
  728:     addEnum(symbol, file)
  729:     l = splitIdentifier(symbol)
  730:     for word in l:
  731: 	addWord(word, file, symbol, 10)
  732: 
  733:     return 1
  734: 
  735: def analyzeAPIConst(top):
  736:     file = top.prop("file")
  737:     if file == None:
  738:         return 0
  739:     symbol = top.prop("name")
  740:     if symbol == None:
  741:         return 0
  742: 
  743:     addConst(symbol, file)
  744:     l = splitIdentifier(symbol)
  745:     for word in l:
  746: 	addWord(word, file, symbol, 10)
  747: 
  748:     return 1
  749: 
  750: def analyzeAPIType(top):
  751:     file = top.prop("file")
  752:     if file == None:
  753:         return 0
  754:     symbol = top.prop("name")
  755:     if symbol == None:
  756:         return 0
  757: 
  758:     addType(symbol, file)
  759:     l = splitIdentifier(symbol)
  760:     for word in l:
  761: 	addWord(word, file, symbol, 10)
  762:     return 1
  763: 
  764: def analyzeAPIFunctype(top):
  765:     file = top.prop("file")
  766:     if file == None:
  767:         return 0
  768:     symbol = top.prop("name")
  769:     if symbol == None:
  770:         return 0
  771: 
  772:     addFunctype(symbol, file)
  773:     l = splitIdentifier(symbol)
  774:     for word in l:
  775: 	addWord(word, file, symbol, 10)
  776:     return 1
  777: 
  778: def analyzeAPIStruct(top):
  779:     file = top.prop("file")
  780:     if file == None:
  781:         return 0
  782:     symbol = top.prop("name")
  783:     if symbol == None:
  784:         return 0
  785: 
  786:     addStruct(symbol, file)
  787:     l = splitIdentifier(symbol)
  788:     for word in l:
  789: 	addWord(word, file, symbol, 10)
  790: 
  791:     info = top.prop("info")
  792:     if info != None:
  793: 	info = string.replace(info, "'", " ")
  794: 	info = string.strip(info)
  795: 	l = string.split(info)
  796: 	for word in l:
  797: 	    if len(word) > 2:
  798: 		addWord(word, file, symbol, 5)
  799:     return 1
  800: 
  801: def analyzeAPIMacro(top):
  802:     file = top.prop("file")
  803:     if file == None:
  804:         return 0
  805:     symbol = top.prop("name")
  806:     if symbol == None:
  807:         return 0
  808:     symbol = string.replace(symbol, "'", " ")
  809:     symbol = string.strip(symbol)
  810: 
  811:     info = None
  812:     cur = top.children
  813:     while cur != None:
  814:         if cur.type == 'text':
  815: 	    cur = cur.next
  816: 	    continue
  817: 	if cur.name == "info":
  818: 	    info = cur.content
  819: 	    break
  820:         cur = cur.next
  821: 
  822:     l = splitIdentifier(symbol)
  823:     for word in l:
  824: 	addWord(word, file, symbol, 10)
  825: 
  826:     if info == None:
  827: 	addMacro(symbol, file)
  828:         print "Macro %s description has no <info>" % (symbol)
  829:         return 0
  830: 
  831:     info = string.replace(info, "'", " ")
  832:     info = string.strip(info)
  833:     addMacro(symbol, file, info)
  834:     l = string.split(info)
  835:     for word in l:
  836: 	if len(word) > 2:
  837: 	    addWord(word, file, symbol, 5)
  838:     return 1
  839: 
  840: def analyzeAPIFunction(top):
  841:     file = top.prop("file")
  842:     if file == None:
  843:         return 0
  844:     symbol = top.prop("name")
  845:     if symbol == None:
  846:         return 0
  847: 
  848:     symbol = string.replace(symbol, "'", " ")
  849:     symbol = string.strip(symbol)
  850:     info = None
  851:     cur = top.children
  852:     while cur != None:
  853:         if cur.type == 'text':
  854: 	    cur = cur.next
  855: 	    continue
  856: 	if cur.name == "info":
  857: 	    info = cur.content
  858: 	elif cur.name == "return":
  859: 	    rinfo = cur.prop("info")
  860: 	    if rinfo != None:
  861: 		rinfo = string.replace(rinfo, "'", " ")
  862: 		rinfo = string.strip(rinfo)
  863: 	        addString(rinfo, file, symbol, 7)
  864: 	elif cur.name == "arg":
  865: 	    ainfo = cur.prop("info")
  866: 	    if ainfo != None:
  867: 		ainfo = string.replace(ainfo, "'", " ")
  868: 		ainfo = string.strip(ainfo)
  869: 	        addString(ainfo, file, symbol, 5)
  870: 	    name = cur.prop("name")
  871: 	    if name != None:
  872: 		name = string.replace(name, "'", " ")
  873: 		name = string.strip(name)
  874: 	        addWord(name, file, symbol, 7)
  875:         cur = cur.next
  876:     if info == None:
  877:         print "Function %s description has no <info>" % (symbol)
  878: 	addFunction(symbol, file, "")
  879:     else:
  880:         info = string.replace(info, "'", " ")
  881: 	info = string.strip(info)
  882: 	addFunction(symbol, file, info)
  883:         addString(info, file, symbol, 5)
  884: 
  885:     l = splitIdentifier(symbol)
  886:     for word in l:
  887: 	addWord(word, file, symbol, 10)
  888: 
  889:     return 1
  890: 
  891: def analyzeAPISymbols(top):
  892:     count = 0
  893:     cur = top.children
  894:         
  895:     while cur != None:
  896:         if cur.type == 'text':
  897: 	    cur = cur.next
  898: 	    continue
  899: 	if cur.name == "macro":
  900: 	    count = count + analyzeAPIMacro(cur)
  901: 	elif cur.name == "function":
  902: 	    count = count + analyzeAPIFunction(cur)
  903: 	elif cur.name == "const":
  904: 	    count = count + analyzeAPIConst(cur)
  905: 	elif cur.name == "typedef":
  906: 	    count = count + analyzeAPIType(cur)
  907: 	elif cur.name == "struct":
  908: 	    count = count + analyzeAPIStruct(cur)
  909: 	elif cur.name == "enum":
  910: 	    count = count + analyzeAPIEnum(cur)
  911: 	elif cur.name == "functype":
  912: 	    count = count + analyzeAPIFunctype(cur)
  913: 	else:
  914: 	    print "unexpected element %s in API doc <files>" % (cur.name)
  915:         cur = cur.next
  916:     return count
  917: 
  918: def analyzeAPI(doc):
  919:     count = 0
  920:     if doc == None:
  921:         return -1
  922:     root = doc.getRootElement()
  923:     if root.name != "api":
  924:         print "Unexpected root name"
  925:         return -1
  926:     cur = root.children
  927:     while cur != None:
  928:         if cur.type == 'text':
  929: 	    cur = cur.next
  930: 	    continue
  931: 	if cur.name == "files":
  932: 	    pass
  933: #	    count = count + analyzeAPIFiles(cur)
  934: 	elif cur.name == "symbols":
  935: 	    count = count + analyzeAPISymbols(cur)
  936: 	else:
  937: 	    print "unexpected element %s in API doc" % (cur.name)
  938:         cur = cur.next
  939:     return count
  940: 
  941: #########################################################################
  942: #									#
  943: #                  Web pages parsing and analysis			#
  944: #									#
  945: #########################################################################
  946: 
  947: import glob
  948: 
  949: def analyzeHTMLText(doc, resource, p, section, id):
  950:     words = 0
  951:     try:
  952: 	content = p.content
  953: 	words = words + addStringHTML(content, resource, id, section, 5)
  954:     except:
  955:         return -1
  956:     return words
  957: 
  958: def analyzeHTMLPara(doc, resource, p, section, id):
  959:     words = 0
  960:     try:
  961: 	content = p.content
  962: 	words = words + addStringHTML(content, resource, id, section, 5)
  963:     except:
  964:         return -1
  965:     return words
  966: 
  967: def analyzeHTMLPre(doc, resource, p, section, id):
  968:     words = 0
  969:     try:
  970: 	content = p.content
  971: 	words = words + addStringHTML(content, resource, id, section, 5)
  972:     except:
  973:         return -1
  974:     return words
  975: 
  976: def analyzeHTML(doc, resource, p, section, id):
  977:     words = 0
  978:     try:
  979: 	content = p.content
  980: 	words = words + addStringHTML(content, resource, id, section, 5)
  981:     except:
  982:         return -1
  983:     return words
  984: 
  985: def analyzeHTML(doc, resource):
  986:     para = 0;
  987:     ctxt = doc.xpathNewContext()
  988:     try:
  989: 	res = ctxt.xpathEval("//head/title")
  990: 	title = res[0].content
  991:     except:
  992:         title = "Page %s" % (resource)
  993:     addPage(resource, title)
  994:     try:
  995: 	items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
  996: 	section = title
  997: 	id = ""
  998: 	for item in items:
  999: 	    if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
 1000: 	        section = item.content
 1001: 		if item.prop("id"):
 1002: 		    id = item.prop("id")
 1003: 		elif item.prop("name"):
 1004: 		    id = item.prop("name")
 1005: 	    elif item.type == 'text':
 1006: 	        analyzeHTMLText(doc, resource, item, section, id)
 1007: 		para = para + 1
 1008: 	    elif item.name == 'p':
 1009: 	        analyzeHTMLPara(doc, resource, item, section, id)
 1010: 		para = para + 1
 1011: 	    elif item.name == 'pre':
 1012: 	        analyzeHTMLPre(doc, resource, item, section, id)
 1013: 		para = para + 1
 1014: 	    else:
 1015: 	        print "Page %s, unexpected %s element" % (resource, item.name)
 1016:     except:
 1017:         print "Page %s: problem analyzing" % (resource)
 1018: 	print sys.exc_type, sys.exc_value
 1019: 
 1020:     return para
 1021: 
 1022: def analyzeHTMLPages():
 1023:     ret = 0
 1024:     HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
 1025:     for html in HTMLfiles:
 1026: 	if html[0:3] == "API":
 1027: 	    continue
 1028: 	if html == "xml.html":
 1029: 	    continue
 1030: 	try:
 1031: 	    doc = libxml2.parseFile(html)
 1032: 	except:
 1033: 	    doc = libxml2.htmlParseFile(html, None)
 1034: 	try:
 1035: 	    res = analyzeHTML(doc, html)
 1036: 	    print "Parsed %s : %d paragraphs" % (html, res)
 1037: 	    ret = ret + 1
 1038: 	except:
 1039: 	    print "could not parse %s" % (html)
 1040:     return ret
 1041: 
 1042: #########################################################################
 1043: #									#
 1044: #                  Mail archives parsing and analysis			#
 1045: #									#
 1046: #########################################################################
 1047: 
 1048: import time
 1049: 
 1050: def getXMLDateArchive(t = None):
 1051:     if t == None:
 1052: 	t = time.time()
 1053:     T = time.gmtime(t)
 1054:     month = time.strftime("%B", T)
 1055:     year = T[0]
 1056:     url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
 1057:     return url
 1058: 
 1059: def scanXMLMsgArchive(url, title, force = 0):
 1060:     if url == None or title == None:
 1061:         return 0
 1062: 
 1063:     ID = checkXMLMsgArchive(url)
 1064:     if force == 0 and ID != -1:
 1065:         return 0
 1066: 
 1067:     if ID == -1:
 1068: 	ID = addXMLMsgArchive(url, title)
 1069: 	if ID == -1:
 1070: 	    return 0
 1071: 
 1072:     try:
 1073:         print "Loading %s" % (url)
 1074:         doc = libxml2.htmlParseFile(url, None);
 1075:     except:
 1076:         doc = None
 1077:     if doc == None:
 1078:         print "Failed to parse %s" % (url)
 1079: 	return 0
 1080: 
 1081:     addStringArchive(title, ID, 20)
 1082:     ctxt = doc.xpathNewContext()
 1083:     texts = ctxt.xpathEval("//pre//text()")
 1084:     for text in texts:
 1085:         addStringArchive(text.content, ID, 5)
 1086: 
 1087:     return 1
 1088: 
 1089: def scanXMLDateArchive(t = None, force = 0):
 1090:     global wordsDictArchive
 1091: 
 1092:     wordsDictArchive = {}
 1093: 
 1094:     url = getXMLDateArchive(t)
 1095:     print "loading %s" % (url)
 1096:     try:
 1097: 	doc = libxml2.htmlParseFile(url, None);
 1098:     except:
 1099:         doc = None
 1100:     if doc == None:
 1101:         print "Failed to parse %s" % (url)
 1102: 	return -1
 1103:     ctxt = doc.xpathNewContext()
 1104:     anchors = ctxt.xpathEval("//a[@href]")
 1105:     links = 0
 1106:     newmsg = 0
 1107:     for anchor in anchors:
 1108: 	href = anchor.prop("href")
 1109: 	if href == None or href[0:3] != "msg":
 1110: 	    continue
 1111:         try:
 1112: 	    links = links + 1
 1113: 
 1114: 	    msg = libxml2.buildURI(href, url)
 1115: 	    title = anchor.content
 1116: 	    if title != None and title[0:4] == 'Re: ':
 1117: 	        title = title[4:]
 1118: 	    if title != None and title[0:6] == '[xml] ':
 1119: 	        title = title[6:]
 1120: 	    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
 1121: 
 1122: 	except:
 1123: 	    pass
 1124: 
 1125:     return newmsg
 1126:     
 1127: 
 1128: #########################################################################
 1129: #									#
 1130: #          Main code: open the DB, the API XML and analyze it		#
 1131: #									#
 1132: #########################################################################
 1133: def analyzeArchives(t = None, force = 0):
 1134:     global wordsDictArchive
 1135: 
 1136:     ret = scanXMLDateArchive(t, force)
 1137:     print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret)
 1138: 
 1139:     i = 0
 1140:     skipped = 0
 1141:     for word in wordsDictArchive.keys():
 1142: 	refs = wordsDictArchive[word]
 1143: 	if refs  == None:
 1144: 	    skipped = skipped + 1
 1145: 	    continue;
 1146: 	for id in refs.keys():
 1147: 	    relevance = refs[id]
 1148: 	    updateWordArchive(word, id, relevance)
 1149: 	    i = i + 1
 1150: 
 1151:     print "Found %d associations in HTML pages" % (i)
 1152: 
 1153: def analyzeHTMLTop():
 1154:     global wordsDictHTML
 1155: 
 1156:     ret = analyzeHTMLPages()
 1157:     print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
 1158: 
 1159:     i = 0
 1160:     skipped = 0
 1161:     for word in wordsDictHTML.keys():
 1162: 	refs = wordsDictHTML[word]
 1163: 	if refs  == None:
 1164: 	    skipped = skipped + 1
 1165: 	    continue;
 1166: 	for resource in refs.keys():
 1167: 	    (relevance, id, section) = refs[resource]
 1168: 	    updateWordHTML(word, resource, section, id, relevance)
 1169: 	    i = i + 1
 1170: 
 1171:     print "Found %d associations in HTML pages" % (i)
 1172: 
 1173: def analyzeAPITop():
 1174:     global wordsDict
 1175:     global API
 1176: 
 1177:     try:
 1178: 	doc = loadAPI(API)
 1179: 	ret = analyzeAPI(doc)
 1180: 	print "Analyzed %d blocs" % (ret)
 1181: 	doc.freeDoc()
 1182:     except:
 1183: 	print "Failed to parse and analyze %s" % (API)
 1184: 	print sys.exc_type, sys.exc_value
 1185: 	sys.exit(1)
 1186: 
 1187:     print "Indexed %d words" % (len(wordsDict))
 1188:     i = 0
 1189:     skipped = 0
 1190:     for word in wordsDict.keys():
 1191: 	refs = wordsDict[word]
 1192: 	if refs  == None:
 1193: 	    skipped = skipped + 1
 1194: 	    continue;
 1195: 	for (module, symbol) in refs.keys():
 1196: 	    updateWord(word, symbol, refs[(module, symbol)])
 1197: 	    i = i + 1
 1198: 
 1199:     print "Found %d associations, skipped %d words" % (i, skipped)
 1200: 
 1201: def usage():
 1202:     print "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
 1203:     sys.exit(1)
 1204: 
 1205: def main():
 1206:     try:
 1207: 	openMySQL()
 1208:     except:
 1209: 	print "Failed to open the database"
 1210: 	print sys.exc_type, sys.exc_value
 1211: 	sys.exit(1)
 1212: 
 1213:     args = sys.argv[1:]
 1214:     force = 0
 1215:     if args:
 1216:         i = 0
 1217: 	while i < len(args):
 1218: 	    if args[i] == '--force':
 1219: 	        force = 1
 1220: 	    elif args[i] == '--archive':
 1221: 	        analyzeArchives(None, force)
 1222: 	    elif args[i] == '--archive-year':
 1223: 	        i = i + 1;
 1224: 		year = args[i]
 1225: 		months = ["January" , "February", "March", "April", "May",
 1226: 			  "June", "July", "August", "September", "October",
 1227: 			  "November", "December"];
 1228: 	        for month in months:
 1229: 		    try:
 1230: 		        str = "%s-%s" % (year, month)
 1231: 			T = time.strptime(str, "%Y-%B")
 1232: 			t = time.mktime(T) + 3600 * 24 * 10;
 1233: 			analyzeArchives(t, force)
 1234: 		    except:
 1235: 			print "Failed to index month archive:"
 1236: 			print sys.exc_type, sys.exc_value
 1237: 	    elif args[i] == '--archive-month':
 1238: 	        i = i + 1;
 1239: 		month = args[i]
 1240: 		try:
 1241: 		    T = time.strptime(month, "%Y-%B")
 1242: 		    t = time.mktime(T) + 3600 * 24 * 10;
 1243: 		    analyzeArchives(t, force)
 1244: 		except:
 1245: 		    print "Failed to index month archive:"
 1246: 		    print sys.exc_type, sys.exc_value
 1247: 	    elif args[i] == '--API':
 1248: 	        analyzeAPITop()
 1249: 	    elif args[i] == '--docs':
 1250: 	        analyzeHTMLTop()
 1251: 	    else:
 1252: 	        usage()
 1253: 	    i = i + 1
 1254:     else:
 1255:         usage()
 1256: 
# Run the indexer only when this file is executed as a script.
if __name__ == "__main__":
    main()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>