Annotation of embedaddon/libxml2/doc/index.py, revision 1.1.1.1
1.1 misho 1: #!/usr/bin/python -u
2: #
3: # imports the API description and fills up a database with
4: # name relevance to modules, functions or web pages
5: #
6: # Operation needed:
7: # =================
8: #
9: # install mysqld, the python wrappers for mysql and libxml2, start mysqld
10: # Change the root passwd of mysql:
11: # mysqladmin -u root password new_password
12: # Create the new database xmlsoft
13: # mysqladmin -p create xmlsoft
14: # Create a database user 'veillard' and give him password access
15: # change veillard and abcde with the right user name and passwd
16: # mysql -p
17: # password:
18: # mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19: # IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20: #
21: # As the user check the access:
22: # mysql -p xmlsoft
23: # Enter password:
24: # Welcome to the MySQL monitor....
25: # mysql> use xmlsoft
26: # Database changed
27: # mysql> quit
28: # Bye
29: #
30: # Then run the script in the doc subdir, it will create the symbols and
31: # word tables and populate them with information extracted from
32: # the libxml2-api.xml API description, and make them accessible read-only
33: # by nobody@localhost, the user Apache is expected to run as
34: #
35: # On the Apache configuration, make sure you have php support enabled
36: #
37:
38: import MySQLdb
39: import libxml2
40: import sys
41: import string
42: import os
43:
44: #
45: # We are not interested in parsing errors here
46: #
def callback(ctx, str):
    """Error handler that discards every message passed to it."""
    pass


# Install the silent handler so libxml2 parsing errors are not reported.
libxml2.registerErrorHandler(callback, None)
50:
51: #
52: # The dictionary of tables required and the SQL command needed
53: # to create them
54: #
# Maps each required table name to the CREATE TABLE statement that
# createTable() executes for it; checkTables() iterates over the keys.
TABLES={
# symbols: one row per API symbol (unique on name), written by updateSymbol().
"symbols" : """CREATE TABLE symbols (
name varchar(255) BINARY NOT NULL,
module varchar(255) BINARY NOT NULL,
type varchar(25) NOT NULL,
descr varchar(255),
UNIQUE KEY name (name),
KEY module (module))""",
# words: relevance of a word for a symbol (unique per pair), written by updateWord().
"words" : """CREATE TABLE words (
name varchar(50) BINARY NOT NULL,
symbol varchar(255) BINARY NOT NULL,
relevance int,
KEY name (name),
KEY symbol (symbol),
UNIQUE KEY ID (name, symbol))""",
# wordsHTML: relevance of a word for a web page, written by updateWordHTML().
"wordsHTML" : """CREATE TABLE wordsHTML (
name varchar(50) BINARY NOT NULL,
resource varchar(255) BINARY NOT NULL,
section varchar(255),
id varchar(50),
relevance int,
KEY name (name),
KEY resource (resource),
UNIQUE KEY ref (name, resource))""",
# wordsArchive: relevance of a word for an archived message, written by
# updateWordArchive(); ID refers to archives.ID.
"wordsArchive" : """CREATE TABLE wordsArchive (
name varchar(50) BINARY NOT NULL,
ID int(11) NOT NULL,
relevance int,
KEY name (name),
UNIQUE KEY ref (name, ID))""",
# pages: title of each indexed web page, written by addPage().
"pages" : """CREATE TABLE pages (
resource varchar(255) BINARY NOT NULL,
title varchar(255) BINARY NOT NULL,
UNIQUE KEY name (resource))""",
# archives: URL and title of each indexed mail message, written by
# addXMLMsgArchive(); ID is assigned by the database.
"archives" : """CREATE TABLE archives (
ID int(11) NOT NULL auto_increment,
resource varchar(255) BINARY NOT NULL,
title varchar(255) BINARY NOT NULL,
UNIQUE KEY id (ID,resource(255)),
INDEX (ID),
INDEX (resource))""",
# Queries / AllQueries: never written by this script; checkTables() grants
# nobody@localhost INSERT and UPDATE on Queries.
"Queries" : """CREATE TABLE Queries (
ID int(11) NOT NULL auto_increment,
Value varchar(50) NOT NULL,
Count int(11) NOT NULL,
UNIQUE KEY id (ID,Value(35)),
INDEX (ID))""",
"AllQueries" : """CREATE TABLE AllQueries (
ID int(11) NOT NULL auto_increment,
Value varchar(50) NOT NULL,
Count int(11) NOT NULL,
UNIQUE KEY id (ID,Value(35)),
INDEX (ID))""",
}
109:
110: #
111: # The XML API description file to parse
112: #
# File name of the XML API description; analyzeAPITop() hands it to loadAPI().
API="libxml2-api.xml"
# Shared database connection; stays None until openMySQL() assigns it.
DB=None
115:
116: #########################################################################
117: # #
118: # MySQL database interfaces #
119: # #
120: #########################################################################
def createTable(db, name):
    """Drop and re-create one of the tables described in TABLES.

    db is an open database connection and name a key of TABLES.
    Returns the result of the CREATE TABLE execute() call, or -1 when an
    argument is None, the table is unknown, or the creation fails.
    """
    global TABLES

    if db is None or name is None:
        return -1
    # A table name cannot be sent as a query parameter, so only names that
    # are keys of TABLES are ever spliced into SQL.  Checking first also
    # keeps an unknown name from dropping a table we could not re-create.
    if name not in TABLES:
        print("Failed to create table %s" % (name))
        return -1
    c = db.cursor()

    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret
140:
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be read.

    Missing tables are created with createTable().  A table whose row
    count cannot be read is repaired with REPAIR TABLE and counted again.
    Finally read access (plus INSERT/UPDATE on Queries) is granted to
    nobody@localhost on a best-effort basis.

    Returns 0, or -1 when db is None.
    """
    global TABLES

    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    # The first column of each "show tables" row is a table name.
    existing = set()
    for row in c.fetchall():
        existing.add(row[0])

    for table in TABLES:
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        count_query = "SELECT count(*) from %s" % table
        try:
            c.execute(count_query)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute(count_query)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Still best effort, but report the failure instead of hiding it.
        print("Could not grant access to nobody@localhost: %s" % (sys.exc_info()[1]))
    return 0
184:
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    When passwd is None it is read from the MySQL_PASS environment
    variable; if that is not set the script exits with status 1.
    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200:
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for the API symbol `symbol`.

    Opens the database through openMySQL() if it is not open yet.  The row
    is inserted; if the INSERT fails the existing row is updated instead.
    Returns the result of the successful execute() call, or -1 when an
    argument is None, no connection is available, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    # Values are passed as query parameters so the driver quotes them.
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230:
def updateSymbol(name, module, type, desc):
    """Store or refresh the `symbols` row describing an API symbol.

    The symbol name is first indexed as a word for itself with relevance
    50.  desc is reduced to its first sentence, capped at 99 characters.
    Returns the result of the successful execute() call, or -1 when
    name, module or type is None, no connection is available, or both
    the INSERT and the fallback UPDATE fail.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or module is None or type is None:
        return -1

    # Apostrophes are still blanked so stored descriptions look the way
    # the original string-built statements wrote them.
    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except AttributeError:
        # desc is not a string (e.g. None)
        desc = ""

    c = DB.cursor()
    # Values are passed as query parameters so the driver quotes them.
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271:
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
274:
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
277:
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
280:
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
283:
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
286:
def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
289:
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
292:
def addPage(resource, title):
    """Store or refresh the title of the web page `resource`.

    Opens the database through openMySQL() if needed.  The row is
    inserted; if the INSERT fails the existing row is updated instead.
    Returns the result of the successful execute() call, or -1 when
    resource is None, no connection is available, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if resource is None:
        return -1

    c = DB.cursor()
    # The title comes straight from the page and may contain quotes, so
    # both values are passed as query parameters.
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The original message formatted names that do not exist in
            # this function, which raised NameError instead of reporting.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("UPDATE pages SET title='%s' WHERE resource='%s'" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320:
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for the web page `resource`.

    desc is stored in the `section` column (capped at 99 characters) and
    id in the `id` column; None becomes an empty string for both.
    Returns the result of the successful execute() call, or -1 when name
    or resource is None, no connection is available, or both the INSERT
    and the fallback UPDATE fail.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        # Apostrophes are still blanked so stored sections look the way
        # the original string-built statements wrote them.
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = ""

    c = DB.cursor()
    # Values are passed as query parameters so the driver quotes them.
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
360:
def checkXMLMsgArchive(url):
    """Return the `archives` ID recorded for `url`, or -1.

    -1 is returned when url is None, no connection is available, the
    query fails, or no row matches.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        # url is passed as a query parameter so the driver quotes it.
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
382:
def addXMLMsgArchive(url, title):
    """Insert an `archives` row for `url` and return its ID as an int.

    title is capped at 99 characters; None becomes an empty string.
    Returns -1 when url is None, no connection is available, a statement
    fails, or the new row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        # Apostrophes are still blanked so stored titles look the way the
        # original string-built statement wrote them.
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # Values are passed as query parameters so the driver quotes them.
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s [%s]" % (cmd, url))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
413:
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for the archived message `id`.

    The row is inserted; if the INSERT fails the existing row is updated.
    Returns the result of the successful execute() call, or -1 when name
    or id is None, no connection is available, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    # Values are passed as query parameters so the driver quotes them.
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("UPDATE wordsArchive SET relevance='%s' where name='%s' and ID='%s'" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443:
444: #########################################################################
445: # #
446: # Word dictionary and analysis routines #
447: # #
448: #########################################################################
449:
450: #
451: # top 100 English words, minus those shorter than 3 letters, plus our own set
452: #
# Words that are never indexed: addWord(), addWordHTML() and
# addWordArchive() return 0 for any word that is a key here.  Only the
# keys matter; every value is 0.  Lookups are exact, so 'Okay' only
# matches with that capitalisation.  'was' and 'may' are listed twice,
# which is harmless in a dict literal.
dropWords = {
'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
'down':0,
'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
}
468:
# In-memory indexes filled while parsing and flushed to the database later.
# wordsDict: word -> {(module, symbol): relevance}, or None once addWord()
# has dropped the word for having more than 500 entries.
wordsDict = {}
# wordsDictHTML: word -> {resource: (relevance, id, section)}
wordsDictHTML = {}
# wordsDictArchive: word -> {archive ID: relevance}
wordsDictArchive = {}
472:
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks turned into spaces.

    Each listed character is replaced by a single space, so the length of
    the string is unchanged and the result can be split on whitespace.
    """
    # "\xc2" and "\xa0" are the two bytes of a UTF-8 encoded
    # non-breaking space when str is a byte string.
    separators = (".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
                  "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
                  "\xc2", "\xa0")
    for separator in separators:
        str = str.replace(separator, " ")
    return str
498:
def cleanupDescrString(str):
    """Return `str` as a single line with normalised spacing.

    Apostrophes, line breaks and the bytes of a UTF-8 non-breaking space
    become spaces, then runs of whitespace collapse to one space and
    leading/trailing whitespace is removed.
    """
    for unwanted in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(unwanted, " ")
    # The original joined the characters of the string itself instead of
    # the split word list, which put a space between every character.
    return " ".join(str.split())
508:
def splitIdentifier(str):
    """Split an identifier into its lower-cased word components.

    A component starts at any ASCII letter, continues over a run of upper
    case letters followed by a run of lower case letters, and is followed
    by an optional run of digits that is discarded.  Any other character
    is skipped.  For example "xmlParseFile" gives ['xml', 'parse', 'file'].
    """
    ret = []
    pos = 0
    end = len(str)
    # Walk the string by index rather than re-slicing it for every
    # character consumed.
    while pos < end:
        cur = str[pos].lower()
        pos += 1
        if cur < 'a' or cur > 'z':
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        cur = cur + str[start:pos].lower()
        # digits right after a component are dropped
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        ret.append(cur)
    return ret
526:
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol) in wordsDict.

    Returns -1 when word is None or shorter than 3 characters, or when
    module or symbol is None.  Returns 0 when the word is ignored: it is
    in dropWords, starts with a non-ASCII character, or has already been
    dropped for having more than 500 entries.  Otherwise returns the
    accumulated relevance now stored for the pair.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    # ASCII stops at 0x7f, so 0x80 itself is already non-ASCII.
    if ord(word[0]) >= 0x80:
        return 0

    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            # too common to be useful: stop tracking this word
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get((module, symbol), 0)
    else:
        d = wordsDict[word] = {}
    d[(module, symbol)] = relevance
    return relevance
554:
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) with `relevance`.

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the addWord() results for the words longer than 2
    characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # The original always passed 5 here and ignored the
            # relevance the caller asked for.
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
566:
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for page `resource` in wordsDictHTML.

    The first id and section recorded for a (word, resource) pair are
    kept; later calls only add to the relevance unless the stored value
    is None.

    Returns -1 when word is None or shorter than 3 characters, or when
    resource or section is None.  Returns 0 when the word is ignored (in
    dropWords, starting with a non-ASCII character, or marked as
    skipped).  Otherwise returns the accumulated relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    # ASCII stops at 0x7f, so 0x80 itself is already non-ASCII.
    if ord(word[0]) >= 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
600:
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for the web page `resource`.

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the successful addWordHTML() results for the words longer
    than 2 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r < 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                else:
                    # failures are not subtracted from the total, the
                    # same way addStringArchive() counts
                    ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
                print("%s %s" % sys.exc_info()[:2])

    return ret
619:
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archive `id` in wordsDictArchive.

    Returns -1 when word is None or shorter than 3 characters, or when id
    is None or -1.  Returns 0 when the word is ignored (in dropWords,
    starting with a non-ASCII character, or marked as skipped).
    Otherwise returns the accumulated relevance.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    # ASCII stops at 0x7f, so 0x80 itself is already non-ASCII.
    if ord(word[0]) >= 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
647:
def addStringArchive(str, id, relevance):
    """Index every word of `str` for the archived message `id`.

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the successful addWordArchive() results for the words
    longer than 2 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordArchive(word, id, relevance)
                if r < 0:
                    print("addWordArchive failed: %s %s" % (word, id))
                else:
                    ret = ret + r
            except Exception:
                print("addWordArchive failed: %s %s %s" % (word, id, relevance))
                print("%s %s" % sys.exc_info()[:2])
    return ret
667:
668: #########################################################################
669: # #
670: # XML API description analysis #
671: # #
672: #########################################################################
673:
def loadAPI(filename):
    """Parse the XML file `filename` with libxml2 and return the document."""
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
678:
def foundExport(file, symbol):
    """Record `symbol`, exported by `file`, as a function and index its name.

    Returns 1 when the symbol was recorded, 0 when file or symbol is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
689:
def analyzeAPIFile(top):
    """Process the <exports> children of a <file> node.

    Returns the number of exports recorded by foundExport().
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # The original supplied one value for two %s, so this
                # report raised TypeError instead of printing.
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
704:
def analyzeAPIFiles(top):
    """Process every <file> child of a <files> node.

    Returns the sum of the analyzeAPIFile() results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            if cur.name == "file":
                count = count + analyzeAPIFile(cur)
            else:
                print("unexpected element %s in API doc <files>" % (cur.name))
        cur = cur.next
    return count
719:
def analyzeAPIEnum(top):
    """Record the <enum> node `top` as a symbol and index its name.

    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
734:
def analyzeAPIConst(top):
    """Record the <const> node `top` as a symbol and index its name.

    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
749:
def analyzeAPIType(top):
    """Record the <typedef> node `top` as a symbol and index its name.

    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
763:
def analyzeAPIFunctype(top):
    """Record the <functype> node `top` as a symbol and index its name.

    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
777:
def analyzeAPIStruct(top):
    """Record the <struct> node `top` as a symbol and index its words.

    The words of the identifier get relevance 10 and the words of the
    optional "info" attribute get relevance 5.
    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
800:
def analyzeAPIMacro(top):
    """Record the <macro> node `top` as a symbol and index its words.

    The words of the identifier get relevance 10 and the words of the
    first <info> child get relevance 5.
    Returns 1 when the macro has an <info> child.  Returns 0 when the
    node has no "file" or "name" attribute, and also when <info> is
    missing (the macro is still recorded in that case).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # find the first <info> child, if any
    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
839:
def analyzeAPIFunction(top):
    """Record the <function> node `top` as a symbol and index its words.

    Indexed text and the relevance requested for it:
      - words of the identifier: 10
      - "info" attribute of <return>: 7
      - "info" attribute of each <arg>: 5, and the argument name: 7
      - content of <info>: 5 (also stored as the symbol description)

    Returns 1, or 0 when the node has no "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                rinfo = rinfo.replace("'", " ").strip()
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                ainfo = ainfo.replace("'", " ").strip()
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                name = name.replace("'", " ").strip()
                addWord(name, file, symbol, 7)
        cur = cur.next
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
890:
def analyzeAPISymbols(top):
    """Dispatch every child of a <symbols> node to its analyzer.

    Returns the sum of the analyzer results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        kind = cur.name
        if kind == "macro":
            count = count + analyzeAPIMacro(cur)
        elif kind == "function":
            count = count + analyzeAPIFunction(cur)
        elif kind == "const":
            count = count + analyzeAPIConst(cur)
        elif kind == "typedef":
            count = count + analyzeAPIType(cur)
        elif kind == "struct":
            count = count + analyzeAPIStruct(cur)
        elif kind == "enum":
            count = count + analyzeAPIEnum(cur)
        elif kind == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            # the original message named <files>, copied from analyzeAPIFiles
            print("unexpected element %s in API doc <symbols>" % (kind))
        cur = cur.next
    return count
917:
def analyzeAPI(doc):
    """Analyze a parsed API description whose root element is <api>.

    Only the <symbols> child is analyzed; <files> is accepted but
    skipped.  Returns the analyzeAPISymbols() total, or -1 when doc is
    None or the root element is not <api>.
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    cur = root.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "files":
            # deliberately not analyzed:
            # count = count + analyzeAPIFiles(cur)
            pass
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print("unexpected element %s in API doc" % (cur.name))
        cur = cur.next
    return count
940:
941: #########################################################################
942: # #
943: # Web pages parsing and analysis #
944: # #
945: #########################################################################
946:
947: import glob
948:
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 when reading or indexing
    the content raises.  doc is not used.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
957:
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 when reading or indexing
    the content raises.  doc is not used.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
966:
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 when reading or indexing
    the content raises.  doc is not used.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
975:
# NOTE(review): this five-argument analyzeHTML is immediately rebound by the
# two-argument analyzeHTML(doc, resource) defined right after it, so it can
# never be called under this name.  Its body is the same as analyzeHTMLText().
def analyzeHTML(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        # any failure reading or indexing the content is reported as -1
        return -1
    return words
984:
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML document `doc` for page `resource`.

    The page title (or "Page <resource>" when none is found) is stored
    with addPage().  Text nodes are then indexed under the most recent
    h1/h2/h3 heading, using the heading's "id" or "name" attribute as
    anchor.  Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    # The XPath context is not released automatically by the libxml2
    # bindings, so free it explicitly whatever happens below.
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
1021:
def analyzeHTMLPages():
    """Index the *.html files of the current and tutorial/ directories.

    Files whose name starts with "API" and the file xml.html are skipped.
    Each file is parsed as XML first and as HTML if that fails.
    Returns the number of pages analyzed without error.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            doc = libxml2.htmlParseFile(html, None)
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # documents from the libxml2 bindings must be freed explicitly
            if doc is not None:
                doc.freeDoc()
    return ret
1041:
1042: #########################################################################
1043: # #
1044: # Mail archives parsing and analysis #
1045: # #
1046: #########################################################################
1047:
1048: import time
1049:
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list for a month.

    t is a time in seconds since the epoch (default: now); the month and
    year are taken from it in UTC.
    """
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    # The archive URLs use English month names.  strftime("%B") follows
    # the process locale, so the names are spelled out here instead.
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    year = T[0]
    month = months[T[1] - 1]
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url
1058:
def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived message: its title and the text inside <pre>.

    A message already present in the `archives` table is skipped unless
    force is non-zero.  Title words get relevance 20, body words 5.
    Returns 1 when the message was indexed and 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # The document and its XPath context are not released automatically
    # by the libxml2 bindings; free both once the text is indexed.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1088:
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from one monthly by-date archive page.

    wordsDictArchive is reset first.  Every link whose href starts with
    "msg" is passed to scanXMLMsgArchive() after a leading "Re: " and
    "[xml] " are removed from its title.
    Returns the number of messages indexed, or -1 when the index page
    cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and its XPath context are not released automatically
    # by the libxml2 bindings; free both when the scan is over.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # one bad message must not stop the scan, but say so
                    print("Failed to index message %s: %s" % (href, sys.exc_info()[1]))
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126:
1127:
1128: #########################################################################
1129: # #
1130: # Main code: open the DB, the API XML and analyze it #
1131: # #
1132: #########################################################################
def analyzeArchives(t = None, force = 0):
    """Scan one month of the mail archive and store its word index.

    t and force are passed to scanXMLDateArchive(); every (word, archive
    ID) relevance collected in wordsDictArchive is then written with
    updateWordArchive().
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive:
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs:
            updateWordArchive(word, id, refs[id])
            i = i + 1

    # the original message said "HTML pages", copied from analyzeHTMLTop
    print("Found %d associations in archive pages" % (i))
1152:
def analyzeHTMLTop():
    """Index the HTML pages and store the collected word index.

    Every (word, resource) entry collected in wordsDictHTML by
    analyzeHTMLPages() is written with updateWordHTML().
    """
    global wordsDictHTML

    ret = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

    i = 0
    for word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in refs:
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1

    print("Found %d associations in HTML pages" % (i))
1172:
def analyzeAPITop():
    """Parse the API description file and store the collected word index.

    Exits with status 1 when the file named by API cannot be parsed or
    analyzed.  Every (word, symbol) relevance collected in wordsDict is
    written with updateWord(); words dropped by addWord() (entry None)
    are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs:
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
1200:
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1204:
def main():
    """Open the database, then run the actions named on the command line.

    Options are processed in order, so --force only affects the archive
    options that follow it.  An unknown option, a missing option value or
    an empty command line prints the usage and exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ("January", "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December")
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg in ('--archive-year', '--archive-month'):
            i = i + 1
            if i >= len(args):
                # the option value is missing; the original raised IndexError
                usage()
            if arg == '--archive-year':
                # index each month of the given year in turn
                stamps = ["%s-%s" % (args[i], month) for month in months]
            else:
                stamps = [args[i]]
            for stamp in stamps:
                try:
                    T = time.strptime(stamp, "%Y-%B")
                    # aim ten days into the month
                    t = time.mktime(T) + 3600 * 24 * 10
                    analyzeArchives(t, force)
                except Exception:
                    print("Failed to index month archive:")
                    print("%s %s" % sys.exc_info()[:2])
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
1256:
# Run the indexer only when this file is executed as a script.
if __name__ == "__main__":
    main()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>