#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Build a book6 index"""
# Version: 2022-12-26 - original
# Version: 2022-12-27 - cosmetic improvements
# Version: 2022-12-28 - allow keywords starting with digit
# Version: 2022-12-30 - include timestamp
# Version: 2023-01-02 - added alt text to logo
# Version: 2023-01-03 - added Contents link at end of index
# Version: 2023-01-05 - added citation index
# Version: 2023-01-10 - added warnings of invalid internal citations
# Version: 2023-05-20 - exclude RFC bibliography from indexing
########################################################
# Copyright (C) 2022-2023 Brian E. Carpenter.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with
# or without modification, are permitted provided that the
# following conditions are met:
#
# 1. Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS
# AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
########################################################
from tkinter import Tk
from tkinter.filedialog import askdirectory
from tkinter.messagebox import askokcancel, askyesno, showinfo
import time
import os
def logit(msg):
"""Add a message to the log file"""
global flog, printing
flog.write(msg+"\n")
if printing:
print(msg)
def logitw(msg):
"""Add a warning message to the log file"""
global warnings
logit("WARNING: "+msg)
warnings += 1
def dprint(*msg):
""" Diagnostic print """
global printing
if printing:
print(*msg)
def crash(msg):
"""Log and crash"""
printing = True
logit("CRASH "+msg)
flog.close()
exit()
def rf(f):
"""Return a file as a list of lower case strings"""
file = open(f, "r",encoding='utf-8', errors='replace')
l = file.readlines()
file.close()
return l
def file_ok(fn):
"""Check if a local file is OK"""
if fn.startswith("../"):
fn = fn.replace("../","")
fn = fn.replace("%20"," ")
return os.path.exists(fn)
def wf(f,l):
"""Write list of strings to file"""
global written
file = open(f, "w",encoding='utf-8')
for line in l:
file.write(line)
file.close()
logit("'"+f+"' written")
def uncase(l):
"""Return lower case version of a list of strings"""
u = []
for s in l:
u.append(s.lower())
return u
specials = ('-','$') #characters allowed in index terms (as well as letters and digits)
ignores = ('and', 'the', 'but', 'for', 'not', 'are', 'all', 'new', 'was', 'can', 'may', 'one',
'two', 'has', 'out', 'use', 'any', 'see', 'now', 'get', 'how', 'its', 'top', 'had',
'that', 'then')
def good(word):
"""Is the word useful?"""
if word in ignores:
return False
return len(word)>2
def exwords(target):
"""Return list of words in a target list of strings"""
words = []
for line in target:
line = line.lower()
if line.strip().startswith(''):
continue #best effort to remove XML comments
if line.startswith('## [') or line.startswith('### ['):
continue #remove section links
inspace = False
inword = False
#hack to combine multi-word terms
for item in split_terms:
if item in line:
line = line.replace(item, item.replace(' ','$'))
for c in ' '+line: #scan characters
if c == ' ':
inspace = True
if not inword:
continue
else:
#end of word
if good(word):
words.append(word)
inword = False
elif c.islower() or c.isdigit() or c in specials:
inspace = False
if inword:
word += c
else:
word = c
inword = True
else: #neither space nor letter nor digit nor special
if inword: #treat as end of word
if good(word):
words.append(word)
inword = False
return words
def indexable(word):
"""if word is indexable, return indexing term"""
for i in range(len(index_terms)):
terms = index_terms[i].split(' ')
for term in terms:
if word == term.lower():
return index_terms[i]
return None
def packx(index):
"""Sort and pack an index"""
index = sorted(index, key=str.casefold)
same = None
for i in range(len(index)):
head = index[i].split("]")[0]
if head != same:
index[i-1] += "\n"
index[i] = index[i].replace(head, head+' '+blob)
same = head
else:
index[i] = index[i].replace(head, '['+blob)
return index
link_warn = "\n"
blob = '●'
######### Startup
#Define some globals
printing = False # True for extra diagnostic prints
warnings = 0
#Announce
Tk().withdraw() # we don't want a full GUI
T = "Book index maker."
printing = askyesno(title=T,
message = "Diagnostic printing?")
where = askdirectory(title = "Select main book directory")
os.chdir(where)
#Open log file
flog = open("indexBook.log", "w",encoding='utf-8')
timestamp = time.strftime("%Y-%m-%d %H:%M:%S UTC%z",time.localtime())
logit("indexBook run at "+timestamp)
logit("Running in directory "+ os.getcwd())
showinfo(title=T,
message = "Will read indexing terms and current book6 text.\nTouch no files until done!")
######### Read indexing terms
raw_terms = rf("utilities/index6.txt")
index_terms = []
split_terms = []
for t in raw_terms:
if t.strip() and not t.strip().startswith("#"):
while "'" in t:
#hack to combine multi-word terms
head, split_term, tail = t.split("'", maxsplit=2)
split_terms.append(split_term.lower())
t = head+split_term.replace(" ","$")+tail
index_terms.append(t.strip())
dprint("Index terms:", index_terms)
dprint("Split terms:", split_terms)
######### Create empty index and citation index
index = []
citex = []
######### Scan all .md files in subdirectories
for path, subdirs, files in os.walk('.'):
path = path.replace('\\','/')
if path.startswith('./') and path[2].isdigit():
dprint("Processing directory", path)
for fn in files:
if fn.endswith('.md') and not "RFC bibliography" in fn:
dprint("Processing file", fn)
target = rf(path+'/'+fn)
#first scan for indexable words and citations
words = exwords(target)
url = (path+'/'+fn).replace(' ','%20')
for w in words:
wx = indexable(w)
if wx:
##dprint(wx, '===', w)
term = wx.split()[0].replace('$',' ')
cite = '['+term+']('+url+')\n'
if not cite in index:
index.append(cite)
if w.startswith('rfc') or w.startswith('bcp') or w.startswith('std'):
if len(w) < 8 and not w.isalpha():
cite = '['+w.upper()+']('+url+')\n'
if not cite in citex:
citex.append(cite)
#now scan for invalid internal citations
#and report errors
for line in target:
#look for internal link pattern like
#[2. Addresses](../2.%20IPv6%20Basic%20Technology/Addresses.md)
if "](TBD)" in line:
logitw("Undefined TBD reference in '"+path[2:]+'/'+fn+"'.")
while "](../" in line:
_, line = line.split("](../", maxsplit=1)
f_cited, line = line.split(")", maxsplit=1)
if not file_ok(f_cited):
logitw("Invalid reference to '"+f_cited.replace("%20", " ")+"' in '"+path[2:]+'/'+fn+"'.")
index = packx(index)
index.insert(0,'# book6 Main Index\n')
index.insert(1,'
\n\n')
index.insert(2,"Generated at "+timestamp+"\n\n")
index.insert(3,"This index was created automatically, so it's dumb. ")
index.insert(4,"It is not case-sensitive. ")
index.insert(5,'It has links to each section that mentions each keyword.\n')
index.insert(6,link_warn)
index.append("\n### [Back to main Contents](Contents.md)")
wf("Index.md", index)
citex = packx(citex)
citex.insert(0,'# book6 Citation Index\n')
citex.insert(1,'
\n\n')
citex.insert(2,"Generated at "+timestamp+"\n\n")
citex.insert(3,"This index was created automatically, so it's dumb. ")
citex.insert(4,'It has links to each section that mentions each citation.\n')
citex.insert(5,link_warn)
citex.append("\n### [Back to main Contents](Contents.md)")
wf("Citex.md", citex)
######### Close log and exit
flog.close()
if warnings:
warn = str(warnings)+" warning(s)\n"
else:
warn = ""
showinfo(title=T,
message = warn+"Check indexBook.log.")