tools/autoindex.py

   1 """
   2 Add Sphinx index entries to RST source.
   3
   4 TODO: scan directory tree, look for *.rst
   5 TODO: add option to remove entries
   6 """
   7
   8 import os
   9 import re
  10 import sys
  11
  12 try:
  13     from ConfigParser import ConfigParser
  14 except ImportError:
  15     from configparser import ConfigParser
  16
  17
  18 # Configuration defaults.
  19 defaults = {'comment': ".. Generated by autoindex",
  20             'mintext': '5000',
  21             'noindex': ''}
  22
  23 # Read config settings.
  24 config = ConfigParser(defaults, allow_no_value=True)
  25 config.optionxform = str
  26
  27 thisdir = os.path.dirname(__file__)
  28 conffile = os.path.join(thisdir, "autoindex.cfg")
  29 config.read(conffile)
  30
  31 # Extract keywords and role mappings.
  32 def getmap(section):
  33     mapping = {}
  34
  35     if config.has_section(section):
  36         for name in config.options(section):
  37             if name not in defaults:
  38                 mapping[name] = config.get(section, name)
  39
  40     return mapping
  41
  42 keywords = getmap('keywords')
  43 rolemap = getmap('rolemap')
  44
  45 # Autoindex comment.
  46 comment = config.get('DEFAULT', 'comment')
  47
  48 # Minimum amount of text twixt identical entries.
  49 mintext = config.getint('DEFAULT', 'mintext')
  50
  51 # Don't add index entries after paragraphs matching this.
  52 noindex = config.get('DEFAULT', 'noindex').strip().split("\n")
  53
  54 if noindex:
  55     noindex_patterns = "(%s)" % "|".join(noindex)
  56 else:
  57     noindex_patterns = None
  58
  59 # Paragraph separator.
  60 separator = "\n\n"
  61
  62
  63 def main(args):
  64     # Parse command args.
  65     if len(args) == 2:
  66         infile = args[1]
  67         outfile = None
  68     elif len(args) == 3:
  69         infile, outfile = args[1:]
  70     else:
  71         sys.exit("Usage: %s INFILE [OUTFILE]" % args[0])
  72
  73     ##dump_paragraphs(infile)
  74
  75     # Do indexing.
  76     autoindex_file(infile, outfile)
  77
  78
  79 def autoindex_file(infile, outfile=None):
  80     "Add index entries to a file."
  81
  82     # Get original text.
  83     with open(infile) as fp:
  84         text = fp.read()
  85
  86     # Index it.
  87     itext = autoindex_text(text)
  88
  89     # Write output (but don't modify original if nothing changed).
  90     if outfile or itext != text:
  91         if outfile == '-':
  92             sys.stdout.write(itext)
  93         else:
  94             with open(outfile or infile, "wb") as fp:
  95                 fp.write(itext)
  96
  97
  98 def autoindex_text(text):
  99     "Add index entries to the given text."
 100     return separator.join(indexed_paragraphs(text))
 101
 102
 103 def indexed_paragraphs(text):
 104     "Yield indexed paragraphs from the specified text."
 105
 106     # Current text position.
 107     textpos = 0
 108
 109     # Text position of last entries for each index word (to avoid too many
 110     # close together for the same entry).
 111     lastpos = {}
 112
 113     def addindex(index, name, desc=None):
 114         if name not in lastpos or lastpos[name] + mintext < textpos:
 115             index.append((name, desc, textpos))
 116             lastpos[name] = textpos
 117
 118     # Whether to add index entries.
 119     noindex = False
 120
 121     for info in paragraph_info(text):
 122         # Update text count.
 123         para = info['text']
 124         textpos += len(para)
 125
 126         # Initialise index (list of [name, desc, textpos]).
 127         index = []
 128
 129         # Find index entries for roles.
 130         for match in re.finditer(r':(.+?):`(.+?)`', para):
 131             role, name = match.groups()
 132             if role in rolemap:
 133                 addindex(index, name, rolemap[role])
 134
 135         # Find index entries for keywords.
 136         paraline = para.replace("\n", " ")
 137         for word, desc in keywords.items():
 138             if re.search(r'\b' + word + r'\b', paraline):
 139                 addindex(index, word, desc)
 140
 141         # Yield index paragraph if required.
 142         if index and not noindex:
 143             indent = info['indent']
 144             lines = [indent + comment]
 145             lines.append(indent + ".. index::")
 146
 147             for name, desc, pos in sorted(index):
 148                 msg = "autoindex: " + name
 149
 150                 if desc:
 151                     text = "   pair: %s; %s" % (name, desc)
 152                     msg += " (" + desc + ")"
 153                 else:
 154                     text = "   single: %s" % name
 155
 156                 lines.append(indent + text)
 157                 sys.stderr.write("%s [%s]\n" % (msg, pos))
 158
 159             yield "\n".join(lines)
 160
 161         noindex = info['noindex']
 162
 163         # Yield paragraph.
 164         yield para
 165
 166
 167 def unindexed_paragraphs(text):
 168     "Yield paragraphs stripped of autoindex comments."
 169
 170     for para in text.split(separator):
 171         if comment not in para:
 172             yield para
 173
 174
 175 def paragraph_info(text):
 176     "Yield paragraph information from text."
 177
 178     noindex = False
 179     noindex_level = None
 180
 181     for para in unindexed_paragraphs(text):
 182         indent = re.match(r' *', para).group()
 183         level = len(indent)
 184
 185         # Detect first entry in a list.  Should be at same indent level as
 186         # its text.
 187         match = re.match(r'\* ', para.lstrip())
 188         if match:
 189             level += len(match.group())
 190
 191         if noindex_patterns:
 192             if not noindex and re.search(noindex_patterns, para, re.M):
 193                 noindex_level = level
 194                 noindex = True
 195             elif noindex_level is not None and level <= noindex_level:
 196                 noindex_level = None
 197                 noindex = False
 198
 199         yield {'text': para,
 200                'noindex': noindex,
 201                'indent': indent,
 202                'level': level}
 203
 204
 205 def dump_paragraphs(infile):
 206     print noindex_patterns
 207
 208     with open(infile) as fp:
 209         text = fp.read()
 210
 211     for info in paragraph_info(text):
 212         print info['level'], info['noindex'], info['text'].replace("\n", " ")
 213
 214
# Run as a script: pass the full argv (program name included) to main().
if __name__ == "__main__":
    main(sys.argv)