Annotation of embedaddon/libxml2/doc/index.py, revision 1.1
1.1 ! misho 1: #!/usr/bin/python -u
! 2: #
! 3: # imports the API description and fills up a database with
! 4: # name relevance to modules, functions or web pages
! 5: #
! 6: # Operation needed:
! 7: # =================
! 8: #
! 9: # install mysqld, the python wrappers for mysql and libxml2, start mysqld
! 10: # Change the root passwd of mysql:
! 11: # mysqladmin -u root password new_password
! 12: # Create the new database xmlsoft
! 13: # mysqladmin -p create xmlsoft
! 14: # Create a database user 'veillard' and give him password access
! 15: # change veillard and abcde with the right user name and passwd
! 16: # mysql -p
! 17: # password:
! 18: # mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
! 19: # IDENTIFIED BY 'abcde' WITH GRANT OPTION;
! 20: #
! 21: # As the user check the access:
! 22: # mysql -p xmlsoft
! 23: # Enter password:
! 24: # Welcome to the MySQL monitor....
! 25: # mysql> use xmlsoft
! 26: # Database changed
! 27: # mysql> quit
! 28: # Bye
! 29: #
! 30: # Then run the script in the doc subdir; it will create the symbols and
! 31: # words tables and populate them with information extracted from
! 32: # the libxml2-api.xml API description, and make them accessible read-only
! 33: # by nobody@localhost, the user Apache is expected to run as
! 34: #
! 35: # On the Apache configuration, make sure you have php support enabled
! 36: #
! 37:
! 38: import MySQLdb
! 39: import libxml2
! 40: import sys
! 41: import string
! 42: import os
! 43:
! 44: #
! 45: # We are not interested in parsing errors here
! 46: #
def callback(ctx, str):
    """Discard a libxml2 error message; both arguments are ignored."""
    pass


# Route every libxml2 diagnostic to the no-op handler above.
libxml2.registerErrorHandler(callback, None)
! 50:
! 51: #
! 52: # The dictionary of tables required and the SQL command needed
! 53: # to create them
! 54: #
# Maps each required table name to the CREATE TABLE statement that builds it.
# createTable() executes these statements verbatim.
TABLES = {
    "symbols": """CREATE TABLE symbols (
    name varchar(255) BINARY NOT NULL,
    module varchar(255) BINARY NOT NULL,
    type varchar(25) NOT NULL,
    descr varchar(255),
    UNIQUE KEY name (name),
    KEY module (module))""",
    "words": """CREATE TABLE words (
    name varchar(50) BINARY NOT NULL,
    symbol varchar(255) BINARY NOT NULL,
    relevance int,
    KEY name (name),
    KEY symbol (symbol),
    UNIQUE KEY ID (name, symbol))""",
    "wordsHTML": """CREATE TABLE wordsHTML (
    name varchar(50) BINARY NOT NULL,
    resource varchar(255) BINARY NOT NULL,
    section varchar(255),
    id varchar(50),
    relevance int,
    KEY name (name),
    KEY resource (resource),
    UNIQUE KEY ref (name, resource))""",
    "wordsArchive": """CREATE TABLE wordsArchive (
    name varchar(50) BINARY NOT NULL,
    ID int(11) NOT NULL,
    relevance int,
    KEY name (name),
    UNIQUE KEY ref (name, ID))""",
    "pages": """CREATE TABLE pages (
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY name (resource))""",
    "archives": """CREATE TABLE archives (
    ID int(11) NOT NULL auto_increment,
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY id (ID,resource(255)),
    INDEX (ID),
    INDEX (resource))""",
    "Queries": """CREATE TABLE Queries (
    ID int(11) NOT NULL auto_increment,
    Value varchar(50) NOT NULL,
    Count int(11) NOT NULL,
    UNIQUE KEY id (ID,Value(35)),
    INDEX (ID))""",
    "AllQueries": """CREATE TABLE AllQueries (
    ID int(11) NOT NULL auto_increment,
    Value varchar(50) NOT NULL,
    Count int(11) NOT NULL,
    UNIQUE KEY id (ID,Value(35)),
    INDEX (ID))""",
}
! 109:
! 110: #
! 111: # The XML API description file to parse
! 112: #
# File name of the XML API description.
API = "libxml2-api.xml"
# Module-wide database connection; openMySQL() assigns it.
DB = None
! 115:
! 116: #########################################################################
! 117: # #
! 118: # MySQL database interfaces #
! 119: # #
! 120: #########################################################################
def createTable(db, name):
    """Drop table `name` if it exists, then create it from TABLES[name].

    Returns the value of the CREATE statement's execute() call, or -1 when
    `db` or `name` is None or the creation step raises (including an unknown
    table name).
    """
    if db is None or name is None:
        return -1
    cursor = db.cursor()

    dropped = cursor.execute("DROP TABLE IF EXISTS %s" % (name))
    if dropped == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        return cursor.execute(TABLES[name])
    except:
        print("Failed to create table %s" % (name))
        return -1
! 140:
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be counted.

    Missing tables are created through createTable(). A table whose row
    count cannot be read is passed to "repair table" and counted again.
    Finally the read-only grants for nobody@localhost are attempted; a
    failure there is ignored.

    Returns -1 when `db` is None, 0 otherwise.
    """
    if db is None:
        return -1
    cursor = db.cursor()
    nbtables = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))

    # Table names already present in the database.
    existing = {}
    for _ in range(nbtables):
        existing[cursor.fetchone()[0]] = {}

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        count_query = "SELECT count(*) from %s" % table
        try:
            cursor.execute(count_query)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except:
            print("Troubles with table %s : repairing" % (table))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            cursor.execute(count_query)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except:
        pass
    return 0
! 184:
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; if that is unset the process exits with status 1.
    Returns the result of checkTables(), or -1 if no connection object was
    obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
! 200:
def updateWord(name, symbol, relevance):
    """Store the relevance of `name` for `symbol` in the words table.

    An INSERT is tried first; if it raises (for instance because the
    (name, symbol) pair already exists) an UPDATE of the existing row is
    tried instead. Values are passed as query parameters so the driver
    quotes them, rather than being pasted into the SQL text.

    Returns the execute() result of the statement that succeeded, or -1 when
    no connection is available, `name` or `symbol` is None, or both
    statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        params = (relevance, name, symbol)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
! 230:
def updateSymbol(name, module, type, desc):
    """Record symbol `name` of kind `type` from `module` in the symbols table.

    The symbol name is first indexed as a word for itself with relevance 50.
    The stored description is the first sentence of `desc` (text before the
    first "."), with apostrophes replaced by spaces, cut to 99 characters;
    any failure while shortening it yields an empty description.

    INSERT is tried first and UPDATE is the fallback, both with query
    parameters so the driver does the quoting. Returns the execute() result,
    or -1 when no connection is available, a required argument is None, or
    both statements fail.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        # The apostrophe replacement is kept so stored text does not change.
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        params = (module, type, desc, name)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
! 271:
def addFunction(name, module, desc = ""):
    """Register `name` as a 'function' symbol of `module`."""
    return updateSymbol(name, module, 'function', desc)


def addMacro(name, module, desc = ""):
    """Register `name` as a 'macro' symbol of `module`."""
    return updateSymbol(name, module, 'macro', desc)


def addEnum(name, module, desc = ""):
    """Register `name` as an 'enum' symbol of `module`."""
    return updateSymbol(name, module, 'enum', desc)


def addStruct(name, module, desc = ""):
    """Register `name` as a 'struct' symbol of `module`."""
    return updateSymbol(name, module, 'struct', desc)


def addConst(name, module, desc = ""):
    """Register `name` as a 'const' symbol of `module`."""
    return updateSymbol(name, module, 'const', desc)


def addType(name, module, desc = ""):
    """Register `name` as a 'type' symbol of `module`."""
    return updateSymbol(name, module, 'type', desc)


def addFunctype(name, module, desc = ""):
    """Register `name` as a 'functype' symbol of `module`."""
    return updateSymbol(name, module, 'functype', desc)
! 292:
def addPage(resource, title):
    """Record the title of web page `resource` in the pages table.

    INSERT is tried first and UPDATE of the existing row is the fallback;
    both pass their values as query parameters. Returns the execute()
    result, or -1 when no connection is available, `resource` is None, or
    both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        params = (title, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            # The old message formatted names that do not exist in this
            # function, so the failure path itself raised NameError.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
! 320:
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for page `resource` in wordsHTML.

    `desc` is stored in the section column with apostrophes replaced by
    spaces and cut to 99 characters (empty if None or if shortening fails);
    a None `id` is stored as an empty string. INSERT is tried first and
    UPDATE is the fallback, both with query parameters.

    Returns the execute() result, or -1 when no connection is available,
    `name` or `resource` is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            # The apostrophe replacement is kept so stored text does not change.
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                  "where name=%s and resource=%s")
        params = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s with %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
! 360:
def checkXMLMsgArchive(url):
    """Return the archives.ID already recorded for `url`, or -1.

    -1 is returned when no connection is available, `url` is None, the
    lookup raises, or no row matches. The URL is passed as a query parameter
    because it comes from scanned archive pages.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1
    return row[0]
! 382:
def addXMLMsgArchive(url, title):
    """Insert a mail-archive page into archives and return its ID as an int.

    The title is stored with apostrophes replaced by spaces and cut to 99
    characters (empty if None). After the INSERT the ID is read back with a
    SELECT on the URL. Both statements use query parameters.

    Returns -1 when no connection is available, `url` is None, a statement
    raises, or the row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        # The apostrophe replacement is kept so stored text does not change.
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        # cmd holds whichever statement was being executed when it failed.
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
! 413:
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archive page `id`.

    INSERT into wordsArchive is tried first and UPDATE of the existing row
    is the fallback; both pass their values as query parameters. Returns the
    execute() result, or -1 when no connection is available, `name` or `id`
    is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        params = (relevance, name, id)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
! 443:
! 444: #########################################################################
! 445: # #
! 446: # Word dictionary and analysis routines #
! 447: # #
! 448: #########################################################################
! 449:
! 450: #
! 451: # the top 100 English words, minus those shorter than 3 letters, plus our own set
! 452: #
# Words that are never indexed. Only the keys matter: the add*Word helpers
# test membership, and every value is 0. Matching is case-sensitive.
dropWords = dict.fromkeys([
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'was', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like', 'may',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
], 0)
! 468:
# In-memory indexes filled by addWord(), addWordHTML() and addWordArchive().
wordsDict = dict()
wordsDictHTML = dict()
wordsDictArchive = dict()
! 472:
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by spaces.

    Each listed character becomes exactly one space, so the result can be
    split on whitespace to obtain candidate words.
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
        "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
        # presumably the two bytes of a UTF-8 non-breaking space
        "\xc2", "\xa0",
    )
    for mark in separators:
        str = str.replace(mark, " ")
    return str
! 498:
def cleanupDescrString(str):
    """Return `str` as a single-line description with normalised spacing.

    Apostrophes, line breaks and the bytes "\\xc2" / "\\xa0" are turned into
    spaces, then runs of whitespace are collapsed to single spaces and the
    ends are trimmed.

    The previous code split the text into words but then joined the
    characters of the unsplit string, which put a space between every
    character of the description.
    """
    for mark in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(mark, " ")
    return " ".join(str.split())
! 508:
def splitIdentifier(str):
    """Split an identifier into lower-cased word fragments.

    Scanning left to right, a fragment starts at a letter and takes any run
    of upper-case letters that follows, then any run of lower-case letters.
    Digits directly after a fragment are dropped, and any other character is
    skipped. For example "xmlParseFile" gives ["xml", "parse", "file"].
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        word = str[pos].lower()
        pos += 1
        if not ('a' <= word <= 'z'):
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        word += str[start:pos].lower()
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        word += str[start:pos]
        # Digits glued to the fragment are discarded.
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        words.append(word)
    return words
! 526:
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol) in wordsDict.

    Returns -1 for a None or shorter-than-3 word or a None module/symbol,
    0 for ignored words (listed in dropWords, first character code above
    0x80, or disabled), and otherwise the accumulated relevance. A word
    found with more than 500 references is disabled by setting its entry
    to None.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word not in wordsDict:
        wordsDict[word] = {}
    else:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        if key in refs:
            relevance = relevance + refs[key]
    wordsDict[word][key] = relevance
    return relevance
! 554:
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) with `relevance`.

    The text is cleaned with cleanupWordsString(), split on whitespace, and
    each word longer than two characters is passed to addWord(). Returns -1
    for a None or shorter-than-3 string, otherwise the sum of the addWord()
    results.

    The `relevance` argument used to be ignored in favour of a hard-coded 5,
    so callers asking for a different weight got 5 anyway.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
! 566:
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for page `resource`.

    wordsDictHTML[word][resource] holds a (relevance, id, section) tuple.
    When the pair is already known, the stored id and section are kept
    (unless they are None) and the relevances are summed.

    Returns -1 for a None or shorter-than-3 word or a None resource/section,
    0 for ignored words (listed in dropWords, first character code above
    0x80, or an entry set to None), otherwise the accumulated relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word not in wordsDictHTML:
        wordsDictHTML[word] = {}
    else:
        known = wordsDictHTML[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        if resource in known:
            (old_relevance, old_id, old_section) = known[resource]
            if old_id is not None:
                id = old_id
            if old_section is not None:
                section = old_section
            relevance = relevance + old_relevance
    wordsDictHTML[word][resource] = (relevance, id, section)
    return relevance
! 600:
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for web page `resource`.

    The text is cleaned with cleanupWordsString(), split on whitespace, and
    each word longer than two characters is passed to addWordHTML(). Returns
    -1 for a None or shorter-than-3 string, otherwise the sum of the
    non-negative addWordHTML() results.

    A negative result is a failure code, so it is reported but no longer
    added into the total; addStringArchive() already behaves this way.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            else:
                ret = ret + r
        except Exception:
            print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return ret
! 619:
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archive page `id`.

    Returns -1 for a None or shorter-than-3 word or an id of None or -1,
    0 for ignored words (listed in dropWords, first character code above
    0x80, or an entry set to None), otherwise the accumulated relevance
    stored in wordsDictArchive[word][id].
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word not in wordsDictArchive:
        wordsDictArchive[word] = {}
    else:
        known = wordsDictArchive[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        if id in known:
            relevance = relevance + known[id]
    wordsDictArchive[word][id] = relevance
    return relevance
! 647:
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive page `id`.

    Returns -1 for a None or shorter-than-3 string, otherwise the sum of the
    non-negative addWordArchive() results. Failures are printed and skipped.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + r
        except:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
! 667:
! 668: #########################################################################
! 669: # #
! 670: # XML API description analysis #
! 671: # #
! 672: #########################################################################
! 673:
def loadAPI(filename):
    """Parse `filename` with libxml2 and return the resulting document."""
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
! 678:
def foundExport(file, symbol):
    """Register exported `symbol` of `file` as a function and index its name.

    Returns 0 when either argument is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
! 689:
def analyzeAPIFile(top):
    """Index the <exports> children of one <file> element.

    Text nodes are skipped. Returns the number of exports that foundExport()
    accepted. Any other child element is reported.

    The report used to supply one value for a two-placeholder format string,
    so any unexpected element raised TypeError instead of being reported.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
! 704:
def analyzeAPIFiles(top):
    """Run analyzeAPIFile() on every <file> child of `top`.

    Text nodes are skipped and other elements are reported. Returns the sum
    of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
! 719:
def _analyzeAPINamedSymbol(top, register):
    """Register one symbol element that carries only "file" and "name".

    `register` is called as register(symbol, file); the identifier is then
    split into words, each indexed with relevance 10. Returns 0 when either
    attribute is missing, 1 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    register(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1


def analyzeAPIEnum(top):
    """Index an <enum> element; returns 1 if indexed, 0 if skipped."""
    return _analyzeAPINamedSymbol(top, addEnum)


def analyzeAPIConst(top):
    """Index a <const> element; returns 1 if indexed, 0 if skipped."""
    return _analyzeAPINamedSymbol(top, addConst)


def analyzeAPIType(top):
    """Index a <typedef> element; returns 1 if indexed, 0 if skipped."""
    return _analyzeAPINamedSymbol(top, addType)


def analyzeAPIFunctype(top):
    """Index a <functype> element; returns 1 if indexed, 0 if skipped."""
    return _analyzeAPINamedSymbol(top, addFunctype)
! 777:
def analyzeAPIStruct(top):
    """Index a <struct> element.

    The structure is registered with addStruct(), the words of its
    identifier are indexed with relevance 10, and the words of its "info"
    attribute that are longer than two characters with relevance 5.
    Returns 0 when the "file" or "name" attribute is missing, 1 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
! 800:
def analyzeAPIMacro(top):
    """Index a <macro> element.

    The words of the identifier are indexed with relevance 10. The text of
    the first <info> child, if any, becomes the macro description and its
    words longer than two characters are indexed with relevance 5.

    Returns 0 when "file" or "name" is missing, and also when there is no
    <info> child (the macro is still registered in that case); 1 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # Find the first <info> child, ignoring text nodes.
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
! 839:
def analyzeAPIFunction(top):
    """Index a <function> element and its children.

    While walking the children: the "info" attribute of <return> is indexed
    through addString() with weight 7, the "info" of each <arg> with weight
    5, and each <arg> name through addWord() with weight 7. The content of
    the last <info> child becomes the function description and is indexed
    with weight 5. Finally the words of the identifier are indexed with
    relevance 10.

    Returns 0 when the "file" or "name" attribute is missing, 1 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "info":
                info = node.content
            elif node.name == "return":
                rinfo = node.prop("info")
                if rinfo is not None:
                    addString(rinfo.replace("'", " ").strip(), file, symbol, 7)
            elif node.name == "arg":
                ainfo = node.prop("info")
                if ainfo is not None:
                    addString(ainfo.replace("'", " ").strip(), file, symbol, 5)
                name = node.prop("name")
                if name is not None:
                    addWord(name.replace("'", " ").strip(), file, symbol, 7)
        node = node.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
! 890:
def analyzeAPISymbols(top):
    """Dispatch every child of `top` to the analyzer for its element name.

    Text nodes are skipped. Returns the sum of the analyzer results. An
    element with no analyzer is reported; that report now names <symbols>,
    the section this function is called for, where it used to say <files>.
    """
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            handler = handlers.get(cur.name)
            if handler is None:
                print("unexpected element %s in API doc <symbols>" % (cur.name))
            else:
                count = count + handler(cur)
        cur = cur.next
    return count
! 917:
def analyzeAPI(doc):
    """Index the API description held in parsed document `doc`.

    Only the <symbols> section is analysed; <files> is recognised but
    skipped. Returns the number of symbols counted by analyzeAPISymbols(),
    or -1 when `doc` is None or the root element is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                # The per-file export lists are not indexed.
                pass
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
! 940:
! 941: #########################################################################
! 942: # #
! 943: # Web pages parsing and analysis #
! 944: # #
! 945: #########################################################################
! 946:
! 947: import glob
! 948:
def _indexHTMLNodeContent(resource, p, section, id):
    """Index the content of node `p` with weight 5; -1 if anything raises."""
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1


def analyzeHTMLText(doc, resource, p, section, id):
    """Index a text node of page `resource`; `doc` is unused."""
    return _indexHTMLNodeContent(resource, p, section, id)


def analyzeHTMLPara(doc, resource, p, section, id):
    """Index a paragraph node of page `resource`; `doc` is unused."""
    return _indexHTMLNodeContent(resource, p, section, id)


def analyzeHTMLPre(doc, resource, p, section, id):
    """Index a preformatted node of page `resource`; `doc` is unused."""
    return _indexHTMLNodeContent(resource, p, section, id)
! 975:
def analyzeHTML(doc, resource):
    """Index the text of parsed web page `doc`, known as `resource`.

    The page is registered with addPage() under its <title> (or
    "Page <resource>" if no title can be read). Headings h1/h2/h3 set the
    current section and, when they carry an "id" or "name" attribute, the
    current anchor id; every text node is indexed under that section and id.
    Returns the number of nodes indexed.

    An earlier five-argument definition of analyzeHTML was dropped from this
    block: it was immediately replaced by this one and could never be
    called. The XPath context is now released on every exit path, since the
    libxml2 bindings do not free it automatically.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
! 1021:
def analyzeHTMLPages():
    """Index every HTML page in the current and tutorial/ directories.

    Files whose name starts with "API" and the file "xml.html" are skipped.
    Each page is parsed as XML first and with the HTML parser as a fallback.
    Returns the number of pages analysed.

    Each parsed document is now freed after use, since the libxml2 bindings
    do not release documents automatically. A page that neither parser can
    load is reported and skipped instead of aborting the whole run.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html.startswith("API") or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            try:
                doc = libxml2.htmlParseFile(html, None)
            except Exception:
                doc = None
        if doc is None:
            print("could not parse %s" % (html))
            continue
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            doc.freeDoc()
    return ret
! 1041:
! 1042: #########################################################################
! 1043: # #
! 1044: # Mail archives parsing and analysis #
! 1045: # #
! 1046: #########################################################################
! 1047:
! 1048: import time
! 1049:
def getXMLDateArchive(t = None):
    """Return the URL of the xml mailing-list date index for time `t`.

    `t` is a timestamp in seconds, read as UTC; None means the current
    time. The URL contains the year and the full month name.
    """
    when = time.gmtime(time.time() if t is None else t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        when[0], time.strftime("%B", when))
! 1058:
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived mail at `url` and index its title and body.

    A message already recorded in the archives table is skipped unless
    `force` is non-zero; an unknown message is recorded first. Title words
    are indexed with weight 20 and the text found under <pre> elements with
    weight 5.

    Returns 1 when the message was indexed and 0 when it was skipped, could
    not be recorded, or could not be parsed.

    The parsed document and its XPath context are now freed on every exit
    path, since the libxml2 bindings do not release them automatically.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
! 1088:
#
# Index every message linked from one monthly by-date archive page.
# t selects the month (see getXMLDateArchive()), force is passed through
# to scanXMLMsgArchive().
# Resets the global wordsDictArchive to an empty dictionary first.
# Returns the sum of the scanXMLMsgArchive() results, or -1 when the
# index page itself could not be loaded.
#
def scanXMLDateArchive(t = None, force = 0):
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                # Only links whose target starts with "msg" are followed.
                if href is None or not href.startswith("msg"):
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    # Strip a leading 'Re: ' and then a leading '[xml] '
                    # from the subject before indexing it.
                    if title is not None and title.startswith('Re: '):
                        title = title[4:]
                    if title is not None and title.startswith('[xml] '):
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Keep going with the other messages, but report the
                    # failure instead of dropping it silently.
                    print("Failed to index %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        # libxml2 documents must be released explicitly.
        doc.freeDoc()

    return newmsg
! 1126:
! 1127:
! 1128: #########################################################################
! 1129: # #
! 1130: # Main code: open the DB, the API XML and analyze it #
! 1131: # #
! 1132: #########################################################################
#
# Index one month of the mail archive and store the result.
# t and force are passed to scanXMLDateArchive(), which fills the global
# wordsDictArchive; every (word, id, relevance) entry found there is then
# handed to updateWordArchive().
# Words whose entry is None are ignored.
#
def analyzeArchives(t = None, force = 0):
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    count = 0
    # Iterate over snapshots so the loops do not depend on the dictionaries
    # staying untouched while updateWordArchive() runs.
    for word, refs in list(wordsDictArchive.items()):
        if refs is None:
            continue
        for msgid, relevance in list(refs.items()):
            updateWordArchive(word, msgid, relevance)
            count = count + 1

    # These associations come from the mail archive, not from HTML pages.
    print("Found %d associations in archive pages" % (count))
! 1152:
#
# Index the HTML pages through analyzeHTMLPages(), then hand every entry
# of the global wordsDictHTML to updateWordHTML().
# Each per-word entry maps a resource to a (relevance, id, section) tuple.
# Words whose entry is None are counted but that count is not reported.
#
def analyzeHTMLTop():
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    count = 0
    ignored = 0
    for term in wordsDictHTML.keys():
        targets = wordsDictHTML[term]
        if targets != None:
            for page in targets.keys():
                weight, anchor, heading = targets[page]
                updateWordHTML(term, page, heading, anchor, weight)
                count += 1
        else:
            ignored += 1

    print("Found %d associations in HTML pages" % (count))
! 1172:
#
# Load the API description named by the global API with loadAPI(), run
# analyzeAPI() on it, then hand every entry of the global wordsDict to
# updateWord().
# Each per-word entry is keyed by (module, symbol); only the symbol and
# the stored value are passed on.
# Exits the process with status 1 if loading or analysis raises.
#
def analyzeAPITop():
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        # sys.exc_info() replaces the deprecated sys.exc_type and
        # sys.exc_value globals.
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    count = 0
    skipped = 0
    for word in list(wordsDict.keys()):
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol) in list(refs.keys()):
            updateWord(word, symbol, refs[(module, symbol)])
            count = count + 1

    print("Found %d associations, skipped %d words" % (count, skipped))
! 1200:
#
# Print the command line synopsis and terminate with exit status 1
#
def usage():
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    raise SystemExit(1)
! 1204:
#
# Command line entry point: open the database, then process the options
# from left to right, so --force only affects the actions that follow it.
# Without arguments, on an unknown option, or when --archive-year or
# --archive-month is given without its value, usage() is called, which
# prints the synopsis and exits.
#
def main():
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()
        return

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # Option given without its value.
                usage()
                return
            if arg == '--archive-year':
                # One "<year>-<month name>" specification per month.
                specs = ["%s-%s" % (args[i], month) for month in months]
            else:
                specs = [args[i]]
            for spec in specs:
                try:
                    T = time.strptime(spec, "%Y-%B")
                    # mktime() works in local time while
                    # getXMLDateArchive() uses UTC; moving 10 days into
                    # the month presumably keeps the timestamp inside
                    # the requested month in either view.
                    t = time.mktime(T) + 3600 * 24 * 10
                    analyzeArchives(t, force)
                except Exception:
                    print("Failed to index month archive:")
                    print("%s %s" % sys.exc_info()[:2])
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>