embedaddon/libxml2/doc/index.py - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / doc / index.py
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:37:59 2012 UTC (12 years, 4 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, v2_8_0p0, v2_8_0, v2_7_8, HEAD

libxml2

#!/usr/bin/python -u
#
# imports the API description and fills up a database with
# name relevance to modules, functions or web pages
#
# Operation needed:
# =================
#
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
# Change the root passwd of mysql:
#   mysqladmin -u root password new_password
# Create the new database xmlsoft
#   mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
# change veillard and abcde with the right user name and passwd
#   mysql -p
#   password:
#   mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
#          IDENTIFIED BY 'abcde' WITH GRANT OPTION;
#
# As the user check the access:
#   mysql -p xmlsoft
#   Enter password:
#   Welcome to the MySQL monitor....
#   mysql> use xmlsoft
#   Database changed
#   mysql> quit
#   Bye
#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost the user expected to be Apache's one
#
# On the Apache configuration, make sure you have php support enabled
#

import MySQLdb
import libxml2
import sys
import string
import os

#
# We are not interested in parsing errors here
#
def callback(ctx, str):
    """libxml2 error handler that silently discards every message."""
    return
libxml2.registerErrorHandler(callback, None)

#
# The dictionary of tables required and the SQL command needed
# to create them
#
TABLES={
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}

#
# The XML API description file to parse
#
API="libxml2-api.xml"
DB=None

#########################################################################
#                                                                       #
#                  MySQL database interfaces                            #
#                                                                       #
#########################################################################
def _printException():
    """Print the type and value of the exception being handled."""
    print("%s %s" % sys.exc_info()[:2])

def _getCursor():
    """Return a cursor on the global connection, opening it on first use.

    Returns None when no connection is available.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return None
    return DB.cursor()

def _insertOrUpdate(c, insert, insertArgs, update, updateArgs, label):
    """Run the INSERT; if the database rejects it, run the UPDATE instead.

    Values are passed as query parameters so the driver does the quoting.
    Returns the result of the statement that succeeded, or -1 after
    printing a diagnostic when both fail.
    """
    try:
        return c.execute(insert, insertArgs)
    except MySQLdb.Error:
        # usually a duplicate key: the row is already there, refresh it
        pass
    try:
        return c.execute(update, updateArgs)
    except MySQLdb.Error:
        print("%s failed command" % (label))
        print("%s -- %s" % (update, repr(updateArgs)))
        _printException()
        return -1

def createTable(db, name):
    """Drop and recreate the table `name` using its definition in TABLES.

    Returns the result of the CREATE statement, or -1 on error.
    """
    if db is None:
        return -1
    if name is None:
        return -1
    # Table names cannot be bound as parameters: only accept known ones,
    # and refuse before dropping anything.
    if name not in TABLES:
        print("Failed to create table %s" % (name))
        return -1
    c = db.cursor()

    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except MySQLdb.Error:
        print("Failed to create table %s" % (name))
        return -1
    return ret

def checkTables(db, verbose = 1):
    """Create missing tables, repair unreadable ones, grant read access.

    Returns 0, or -1 when `db` is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    existing = [row[0] for row in c.fetchall()]

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except MySQLdb.Error:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except MySQLdb.Error:
        # best effort: the indexing user may not hold the GRANT privilege
        pass
    return 0

def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    The password defaults to the MySQL_PASS environment variable; the
    script exits with status 1 when neither is provided.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    ret = checkTables(DB, verbose)
    return ret

def updateWord(name, symbol, relevance):
    """Record that word `name` refers to `symbol` with the given relevance."""
    c = _getCursor()
    if c is None or name is None or symbol is None:
        return -1

    return _insertOrUpdate(c,
        "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
        (name, symbol, relevance),
        "UPDATE words SET relevance = %s where name = %s and symbol = %s",
        (relevance, name, symbol),
        "Update word (%s, %s, %s)" % (name, symbol, relevance))

def updateSymbol(name, module, type, desc):
    """Register a symbol of kind `type` from `module`.

    Only the first sentence of `desc`, cut to 99 characters, is stored.
    The symbol name is also indexed as a word pointing to itself.
    """
    updateWord(name, name, 50)
    c = _getCursor()
    if c is None or name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ")
        desc = desc.split(".")[0]
        desc = desc[0:99]
    except (AttributeError, TypeError):
        # no usable description
        desc = ""

    return _insertOrUpdate(c,
        "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
        (name, module, type, desc),
        "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
        (module, type, desc, name),
        "Update symbol (%s, %s, %s)" % (name, module, type))

def addFunction(name, module, desc = ""):
    return updateSymbol(name, module, 'function', desc)

def addMacro(name, module, desc = ""):
    return updateSymbol(name, module, 'macro', desc)

def addEnum(name, module, desc = ""):
    return updateSymbol(name, module, 'enum', desc)

def addStruct(name, module, desc = ""):
    return updateSymbol(name, module, 'struct', desc)

def addConst(name, module, desc = ""):
    return updateSymbol(name, module, 'const', desc)

def addType(name, module, desc = ""):
    return updateSymbol(name, module, 'type', desc)

def addFunctype(name, module, desc = ""):
    return updateSymbol(name, module, 'functype', desc)

def addPage(resource, title):
    """Register (or retitle) the web page `resource`."""
    c = _getCursor()
    if c is None or resource is None:
        return -1

    return _insertOrUpdate(c,
        "INSERT INTO pages (resource, title) VALUES (%s, %s)",
        (resource, title),
        "UPDATE pages SET title=%s WHERE resource=%s",
        (title, resource),
        "Update page (%s, %s)" % (resource, title))

def updateWordHTML(name, resource, desc, id, relevance):
    """Record that word `name` appears in page `resource`.

    `desc` is the section title (cut to 99 characters) and `id` the
    anchor of that section; both default to the empty string.
    """
    c = _getCursor()
    if c is None or name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")
            desc = desc[0:99]
        except (AttributeError, TypeError):
            desc = ""

    return _insertOrUpdate(c,
        "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
        (name, resource, desc, id, relevance),
        "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
        (desc, id, relevance, name, resource),
        "Update symbol (%s, %s, %s)" % (name, resource, relevance))

def checkXMLMsgArchive(url):
    """Return the archives ID recorded for `url`, or -1 if there is none."""
    c = _getCursor()
    if c is None or url is None:
        return -1

    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except MySQLdb.Error:
        return -1
    if row is None:
        return -1
    return row[0]

def addXMLMsgArchive(url, title):
    """Insert the archive page `url` and return its new ID, or -1."""
    c = _getCursor()
    if c is None or url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")
        title = title[0:99]

    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
    except MySQLdb.Error:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])

def updateWordArchive(name, id, relevance):
    """Record that word `name` appears in the archive page number `id`."""
    c = _getCursor()
    if c is None or name is None or id is None:
        return -1

    return _insertOrUpdate(c,
        "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
        (name, id, relevance),
        "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
        (relevance, name, id),
        "Update word archive (%s, %s, %s)" % (name, id, relevance))

#########################################################################
#                                                                       #
#                  Word dictionary and analysis routines                #
#                                                                       #
#########################################################################

#
# top 100 english word without the one len < 3 + own set
#
dropWords = dict.fromkeys([
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
], 0)

wordsDict = {}
wordsDictHTML = {}
wordsDictArchive = {}

# characters turned into word separators before splitting a text
_WORD_SEPARATORS = ".!?,'\";(){}<>=/*:#\\\n\r\xc2\xa0"
# characters blanked out in section titles
_DESCR_SEPARATORS = "'\n\r\xc2\xa0"

def cleanupWordsString(str):
    """Replace punctuation and line breaks in `str` by spaces."""
    for ch in _WORD_SEPARATORS:
        str = str.replace(ch, " ")
    return str

def cleanupDescrString(str):
    """Drop quotes and line breaks from `str` and collapse whitespace."""
    for ch in _DESCR_SEPARATORS:
        str = str.replace(ch, " ")
    # join the words, not the characters of the string
    return " ".join(str.split())

def splitIdentifier(str):
    """Split a C identifier into its lowercased words.

    A word is one letter, followed by a run of uppercase letters, then a
    run of lowercase letters; digits after it and any other character
    are dropped: "xmlParseFile" gives ['xml', 'parse', 'file'].
    """
    ret = []
    pos = 0
    end = len(str)
    while pos < end:
        cur = str[pos].lower()
        pos += 1
        if cur < 'a' or cur > 'z':
            continue
        while pos < end and 'A' <= str[pos] <= 'Z':
            cur += str[pos].lower()
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            cur += str[pos]
            pos += 1
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        ret.append(cur)
    return ret

def addWord(word, module, symbol, relevance):
    """Accumulate `relevance` for `word` -> (module, symbol) in wordsDict.

    Returns the accumulated relevance, 0 when the word is ignored and -1
    on invalid arguments.  A word referenced by more than 500 symbols is
    considered noise and disabled (its entry becomes None).
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get((module, symbol), 0)
    else:
        d = wordsDict[word] = {}
    d[(module, symbol)] = relevance
    return relevance

def addString(str, module, symbol, relevance):
    """Index every word of `str` for `symbol` with the given relevance."""
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # honour the caller's relevance instead of a hard-coded 5
            ret = ret + addWord(word, module, symbol, relevance)

    return ret

def addWordHTML(word, resource, id, section, relevance):
    """Accumulate `relevance` for `word` -> page `resource` in wordsDictHTML.

    The anchor id and section of the first occurrence in a page are kept.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance

def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for the page `resource`."""
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r < 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                else:
                    ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
                _printException()

    return ret

def addWordArchive(word, id, relevance):
    """Accumulate `relevance` for `word` -> archive `id` in wordsDictArchive."""
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance

def addStringArchive(str, id, relevance):
    """Index every word of `str` for the archive page number `id`."""
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordArchive(word, id, relevance)
                if r < 0:
                    print("addWordArchive failed: %s %s" % (word, id))
                else:
                    ret = ret + r
            except Exception:
                print("addWordArchive failed: %s %s %d" % (word, id, relevance))
                _printException()
    return ret

#########################################################################
#                                                                       #
#                  XML API description analysis                         #
#                                                                       #
#########################################################################

def _elementChildren(top):
    """Iterate over the children of `top`, skipping text nodes."""
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            yield cur
        cur = cur.next

def _indexIdentifier(symbol, module):
    """Index the words making up the identifier `symbol`."""
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

def _indexInfoWords(info, module, symbol):
    """Index the whitespace separated words of a description."""
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)

def _analyzeNamedSymbol(top, register):
    """Register a symbol described only by its file and name attributes."""
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    register(symbol, module)
    _indexIdentifier(symbol, module)
    return 1

def loadAPI(filename):
    """Parse the XML API description and return the libxml2 document."""
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc

def foundExport(file, symbol):
    """Register `symbol` as a function exported by `file`."""
    if file is None:
        return 0
    if symbol is None:
        return 0
    addFunction(symbol, file)
    _indexIdentifier(symbol, file)
    return 1

def analyzeAPIFile(top):
    """Process one <file> element, return the number of exports found."""
    count = 0
    name = top.prop("name")
    for cur in _elementChildren(top):
        if cur.name == "exports":
            count = count + foundExport(name, cur.prop("symbol"))
        else:
            print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
    return count

def analyzeAPIFiles(top):
    """Process the <files> element, return the number of exports found."""
    count = 0
    for cur in _elementChildren(top):
        if cur.name == "file":
            count = count + analyzeAPIFile(cur)
        else:
            print("unexpected element %s in API doc <files>" % (cur.name))
    return count

def analyzeAPIEnum(top):
    return _analyzeNamedSymbol(top, addEnum)

def analyzeAPIConst(top):
    return _analyzeNamedSymbol(top, addConst)

def analyzeAPIType(top):
    return _analyzeNamedSymbol(top, addType)

def analyzeAPIFunctype(top):
    return _analyzeNamedSymbol(top, addFunctype)

def analyzeAPIStruct(top):
    """Register a <struct> and index its name and info attribute."""
    if _analyzeNamedSymbol(top, addStruct) == 0:
        return 0

    info = top.prop("info")
    if info is not None:
        info = info.replace("'", " ").strip()
        _indexInfoWords(info, top.prop("file"), top.prop("name"))
    return 1

def analyzeAPIMacro(top):
    """Register a <macro>; returns 0 when it has no <info> child."""
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    for cur in _elementChildren(top):
        if cur.name == "info":
            info = cur.content
            break

    _indexIdentifier(symbol, module)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    _indexInfoWords(info, module, symbol)
    return 1

def analyzeAPIFunction(top):
    """Register a <function> and index its name, description,
    return value and arguments."""
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    for cur in _elementChildren(top):
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                rinfo = rinfo.replace("'", " ").strip()
                addString(rinfo, module, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                ainfo = ainfo.replace("'", " ").strip()
                addString(ainfo, module, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                name = name.replace("'", " ").strip()
                addWord(name, module, symbol, 7)
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    _indexIdentifier(symbol, module)

    return 1

def analyzeAPISymbols(top):
    """Process the <symbols> element, return the number of symbols indexed."""
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    for cur in _elementChildren(top):
        handler = handlers.get(cur.name)
        if handler is None:
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        else:
            count = count + handler(cur)
    return count

def analyzeAPI(doc):
    """Index a parsed API description.

    Returns the number of symbols indexed, or -1 when the document is
    missing or its root is not <api>.  The <files> section is skipped.
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    for cur in _elementChildren(root):
        if cur.name == "files":
            pass
#            count = count + analyzeAPIFiles(cur)
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print("unexpected element %s in API doc" % (cur.name))
    return count

#########################################################################
#                                                                       #
#                  Web pages parsing and analysis                       #
#                                                                       #
#########################################################################

import glob

def _indexNodeContent(resource, p, section, id):
    """Index the text content of node `p` found in page `resource`.

    Returns the relevance accumulated by addStringHTML, or -1 on error.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1

def analyzeHTMLText(doc, resource, p, section, id):
    """Index a text node of an HTML page."""
    return _indexNodeContent(resource, p, section, id)

def analyzeHTMLPara(doc, resource, p, section, id):
    """Index a <p> element of an HTML page."""
    return _indexNodeContent(resource, p, section, id)

def analyzeHTMLPre(doc, resource, p, section, id):
    """Index a <pre> element of an HTML page."""
    return _indexNodeContent(resource, p, section, id)

# NOTE: a five-argument analyzeHTML(doc, resource, p, section, id) used to
# be defined here; it was dead code, immediately replaced by the
# two-argument definition below.

def analyzeHTML(doc, resource):
    """Register the page `resource` and index all its text.

    Each text node is attached to the closest preceding h1/h2/h3 title
    and anchor.  Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            # no usable <title>
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts are not reclaimed automatically by the bindings
        ctxt.xpathFreeContext()

    return para

def analyzeHTMLPages():
    """Index *.html and tutorial/*.html, return the number of pages parsed.

    The generated API* pages and xml.html are skipped.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        doc = None
        try:
            try:
                doc = libxml2.parseFile(html)
            except Exception:
                # not well-formed XML, retry with the HTML parser
                doc = libxml2.htmlParseFile(html, None)
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
        if doc is not None:
            doc.freeDoc()
    return ret

#########################################################################
#                                                                       #
#                  Mail archives parsing and analysis                   #
#                                                                       #
#########################################################################

import time

def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml mailing-list archive
    for the month containing `t` (seconds since the epoch, UTC; default now).
    """
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    month = time.strftime("%B", T)
    year = T[0]
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url

def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived message, return 1 if it was indexed, else 0.

    Messages already present in the database are skipped unless `force`.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    try:
        for text in ctxt.xpathEval("//pre//text()"):
            addStringArchive(text.content, ID, 5)
    finally:
        ctxt.xpathFreeContext()
        doc.freeDoc()

    return 1

def scanXMLDateArchive(t = None, force = 0):
    """Index all the messages of the month containing `t`.

    Resets wordsDictArchive first.  Returns the number of messages
    indexed, or -1 when the month index could not be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1
    newmsg = 0
    ctxt = doc.xpathNewContext()
    try:
        for anchor in ctxt.xpathEval("//a[@href]"):
            href = anchor.prop("href")
            if href is None or href[0:3] != "msg":
                continue
            try:
                msg = libxml2.buildURI(href, url)
                title = anchor.content
                if title is not None and title[0:4] == 'Re: ':
                    title = title[4:]
                if title is not None and title[0:6] == '[xml] ':
                    title = title[6:]
                newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
            except Exception:
                # keep going with the other messages, but say why
                print("Failed to index message %s" % (href))
                print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()
        doc.freeDoc()

    return newmsg


#########################################################################
#                                                                       #
#          Main code: open the DB, the API XML and analyze it           #
#                                                                       #
#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the words found."""
    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            updateWordArchive(word, id, refs[id])
            i = i + 1

    print("Found %d associations in archive pages" % (i))

def analyzeHTMLTop():
    """Index the HTML documentation and store the words found."""
    ret = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

    i = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1

    print("Found %d associations in HTML pages" % (i))

def analyzeAPITop():
    """Index the XML API description and store the words found.

    Exits with status 1 when the description cannot be analyzed.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            # word disabled by addWord because it was too common
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))

def usage():
    """Print the command line synopsis and exit with status 1."""
    print("Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]")
    sys.exit(1)

def _indexMonth(spec, force):
    """Index the archives of the month given as "YEAR-MonthName"."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # aim at the middle of the month to stay clear of its boundaries
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Open the database, then run the actions given on the command line
    in order; --force only affects the actions that follow it."""
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option needs a value
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexMonth("%s-%s" % (args[i], month), force)
            else:
                _indexMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()