add script to convert wiki tables to csv

2022-08-20 00:04:31 +02:00 · 2022-08-20 00:04:31 +02:00 · 6a828f4a52
commit 6a828f4a52
parent 7f231c2e5b
1 changed files with 211 additions and 0 deletions
--- a/python/wiki2csv.py
+++ b/python/wiki2csv.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python3
+# -*- tab-width: 2; indent-tabs-mode: t; -*-
+
+# Copyright 2012 Jan Kanis
+# License: GPL-3.0
+
+
+# wiki2csv
+#
+# An explanation of this program is given in the accompanying README file.
+# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/
+# If you find any bugs, you can report them there.
+# For command line options, see the help output of "wiki2csv.py --help".
+# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.
+
+
+from collections import namedtuple
+import sys, re, os.path, argparse, csv
+
+
+# One token of the wikitable syntax: its lexeme type (one of the classes
+# below), the text following the syntax marker, and the raw source text.
+Lexeme = namedtuple('Lexeme', ['type', 'data', 'raw'])
+
+# The different lexeme types in the wiki table syntax. The classes are used
+# purely as type tags in this file; the Singlerow/Continued variants subclass
+# their base type.
+class PreTable:
+	"""All text before the table starts gets this type."""
+class TableStart:
+	"""Line starting with the '{|' marker."""
+class TableCaption:
+	"""Line starting with the '|+' marker."""
+class TableRow:
+	"""Line starting with the '|-' marker."""
+class TableHeader:
+	"""Line starting with the '!' marker."""
+class TableHeaderSinglerow (TableHeader):
+	"""First header cell of a line that holds several '!!'-separated cells."""
+class TableHeaderContinued (TableHeader):
+	"""Subsequent header cell of a line that holds several cells."""
+class TableData:
+	"""Line starting with the '|' marker."""
+class TableDataSinglerow (TableData):
+	"""First data cell of a line that holds several '||'-separated cells."""
+class TableDataContinued (TableData):
+	"""Subsequent data cell of a line that holds several cells."""
+class TableEnd:
+	"""Line starting with the '|}' marker."""
+
+# What should happen for each lexeme type when converting to CSV.
+actions = {
+	# Store the item on a row of its own
+	'singlerow': (TableStart, TableCaption, TableEnd),
+	# Store the data without the syntax marker
+	'data': (TableData,),
+	# Store the full raw text
+	'raw': (TableHeader,),
+}
+
+# Associations between wiki syntax markers and lexeme types. Order matters:
+# wikitableparse uses the first marker a line starts with, so the
+# two-character markers must be listed before the bare '|'.
+wikitypes = [
+	('{|', TableStart),
+	('|+', TableCaption),
+	('|-', TableRow),
+	('|}', TableEnd),
+	('!', TableHeader),
+	('|', TableData),
+]
+
+
+# a generator that returns Lexemes. Input is a single string with a wikitable.
+def wikitableparse(table):
+	"""Split a wikitable into Lexemes.
+
+	table is the complete wikitable text as one string. The text is processed
+	line by line: a line whose first non-blank characters are one of the
+	markers in wikitypes starts a new lexeme, any other line is a continuation
+	of the current lexeme and is appended to both its data and its raw text.
+	A data line holding several cells separated by '||' (or a header line with
+	'!!') is split into one lexeme per cell: the first gets the ...Singlerow
+	type, the following ones the ...Continued type.
+
+	Yields Lexeme(type, data, raw) tuples, where data is the text after the
+	syntax marker and raw is the unmodified source text. Text before the
+	first marker line is only yielded (as a single PreTable lexeme) when the
+	input contains no marker line at all.
+	"""
+	lines = table.split('\n')
+	# a trailing newline would otherwise add an empty continuation line
+	if not lines[-1]:
+		del lines[-1]
+
+	# (single-cell type, type of first cell, type of later cells, separator)
+	multicell = (
+		(TableData, TableDataSinglerow, TableDataContinued, '||'),
+		(TableHeader, TableHeaderSinglerow, TableHeaderContinued, '!!'),
+	)
+
+	current = dict(type=PreTable, data='', raw='')
+
+	for row in lines:
+		srow = row.lstrip()
+		for marker, lextype in wikitypes:
+			if not srow.startswith(marker):
+				continue
+
+			# a new lexeme starts here, so the previous one is complete
+			if current['type'] != PreTable:
+				yield Lexeme(**current)
+			current = dict(type=lextype, data=srow[len(marker):], raw=row)
+
+			# process multiple (header) cells on one line
+			for basetype, firsttype, resttype, sep in multicell:
+				if lextype != basetype or sep not in current['data']:
+					continue
+				cells = current['data'].split(sep)
+				indent = row[:len(row) - len(srow)]
+				yield Lexeme(type=firsttype, data=cells[0], raw=indent+marker+cells[0])
+				for cell in cells[1:-1]:
+					yield Lexeme(type=resttype, data=cell, raw=sep+cell)
+				# The last cell stays open: following lines may continue it.
+				current = dict(type=resttype, data=cells[-1], raw=sep+cells[-1])
+				break
+
+			# Don't try to match again if we already have a match
+			break
+
+		# continuation of previous lexeme on next line
+		else:
+			current['data'] += '\n' + row
+			current['raw'] += '\n' + row
+
+	yield Lexeme(**current)
+
+
+def wiki2csv(wikifile, csvfile):
+	"""Convert the wikitable read from wikifile to CSV written to csvfile.
+
+	wikifile is a readable text file object, csvfile a writable one. Table
+	start, caption and end lexemes are each written as a CSV row of their own
+	containing the raw text. Header cells are stored with their raw text, data
+	cells without the syntax marker. A row marker ('|-') ends the current CSV
+	row; rows without cells are not written.
+	"""
+	writer = csv.writer(csvfile)
+	row = []
+	for lex in wikitableparse(wikifile.read()):
+		if lex.type == TableRow:
+			if row: writer.writerow(row)
+			row = []
+		# issubclass rather than 'in': the Singlerow/Continued lexeme types are
+		# subclasses of the types listed in actions and must be handled as well.
+		elif issubclass(lex.type, actions['singlerow']):
+			if row: writer.writerow(row)
+			writer.writerow([lex.raw])
+			row = []
+		elif issubclass(lex.type, actions['data']):
+			row.append(lex.data)
+		elif issubclass(lex.type, actions['raw']):
+			row.append(lex.raw)
+	if row:
+		writer.writerow(row)
+
+
+# Build a regex that matches any wiki marker whose lexeme type is listed
+# under the given key of actions.
+def _markerregex(action):
+	markers = [re.escape(marker) for marker, lextype in wikitypes
+			if lextype in actions[action]]
+	return re.compile('|'.join(markers))
+
+# Used by parsecsv to recognise CSV cells that were stored with their raw
+# text, respectively on a row of their own.
+rawtypes = _markerregex('raw')
+singlerowtypes = _markerregex('singlerow')
+
+def parsecsv(csvfile):
+	"""Generate the lines of a wikitable from the CSV data in csvfile.
+
+	csvfile is a readable text file object. Yields one string per output
+	line, without a trailing newline. A cell starting with a singlerow marker
+	(table start, caption, end) is yielded as is and ends its CSV row; a cell
+	starting with a header marker is yielded as is; any other cell gets the
+	'|' data marker prepended. Every CSV row that is not a singlerow item is
+	followed by a '|-' row marker. Empty CSV rows are skipped.
+	"""
+	for line in csv.reader(csvfile):
+		# csv.reader returns an empty list for a blank line; there is nothing
+		# to emit for it.
+		if not line:
+			continue
+		singlerow = False
+		for cell in line:
+			if singlerowtypes.match(cell):
+				yield cell
+				singlerow = True
+				break
+			elif rawtypes.match(cell):
+				yield cell
+			elif len(cell) and cell[0] in '-+}':
+				# Avoid a cornercase where a normal data cell has e.g. '-1' as content,
+				# which would result in a new row marker
+				yield '| '+cell
+			else:
+				yield '|'+cell
+		if not singlerow:
+			yield '|-'
+
+def csv2wiki(csvfile, wikifile):
+	"""Convert the CSV data read from csvfile to a wikitable in wikifile.
+
+	Every line produced by parsecsv is written to wikifile followed by a
+	newline.
+	"""
+	wikifile.writelines(line + '\n' for line in parsecsv(csvfile))
+
+
+def main():
+	"""Command line entry point.
+
+	Parses the command line and converts SOURCE to DEST. The conversion
+	direction is taken from --tocsv/--towiki; when neither is given it depends
+	on the name the program was started as: 'csv2wiki' converts CSV to
+	wikitable syntax, any other name converts wikitable syntax to CSV.
+	"""
+
+	progname = os.path.basename(sys.argv[0])
+	progname_cooked = os.path.splitext(progname)[0]
+
+	# to show the correct help text
+	towikidefault = tocsvdefault = ''
+	if progname_cooked == 'csv2wiki':
+		towikidefault = '(default for {}) '.format(progname)
+		description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."
+	else:
+		tocsvdefault = '(default for {}) '.format(progname)
+		description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."
+
+	# parse arguments
+	parser = argparse.ArgumentParser(description=description)
+	parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")
+
+	direction_group = parser.add_mutually_exclusive_group()
+	direction_group.add_argument('--tocsv', '-c', action='store_true',
+		help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")
+	direction_group.add_argument('--towiki', '-w', action='store_true',
+		help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")
+
+	parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
+		help="The input file to read from. Omit or use '-' to read from stdin")
+	parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
+		help="The file to write output to. Omit or use '-' to write to stdout")
+
+	args = parser.parse_args()
+
+	# an explicit option wins over the default derived from the program name
+	if args.towiki:
+		direction = 'towiki'
+	elif args.tocsv:
+		direction = 'tocsv'
+	elif progname_cooked == 'csv2wiki':
+		direction = 'towiki'
+	else:
+		direction = 'tocsv'
+
+	# Diagnostics go to stderr so they never end up in the converted output
+	# when DEST is stdout.
+	if args.verbose:
+		print('direction=%s' % direction, file=sys.stderr)
+		print('source=%s' % args.source, file=sys.stderr)
+		print('dest=%s' % args.dest, file=sys.stderr)
+
+	if direction == 'towiki':
+		csv2wiki(args.source, args.dest)
+	else:
+		wiki2csv(args.source, args.dest)
+
+	if args.verbose:
+		print('Conversion completed', file=sys.stderr)
+
+
+if __name__ == '__main__':
+	main()