snippets/python/wiki2csv.py

#!/usr/bin/python3
# -*- tab-width: 2; indent-tabs-mode: t; -*-

# Copyright 2012 Jan Kanis
# License: GPL-3.0


# wiki2csv
#
# An explanation of this program is given in the accompanying README file.
# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/
# If you find any bugs, you can report them there.
# For command line options, see the help output of "wiki2csv.py --help".
# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.


from collections import namedtuple
import sys, re, os.path, argparse, csv


# One parsed piece of wikitable markup: `type` is the lexeme class, `data` the
# text without the leading syntax marker, `raw` the text as it appeared in the input.
Lexeme = namedtuple('Lexeme', ['type', 'data', 'raw'])

# the different lexeme types in the wiki table syntax
class PreTable:
	"""All text before the table starts gets this type."""


class TableStart:
	"""Lexeme type for a line starting with the '{|' marker."""


class TableCaption:
	"""Lexeme type for a line starting with the '|+' marker."""


class TableRow:
	"""Lexeme type for a line starting with the '|-' marker."""


class TableHeader:
	"""Lexeme type for a line starting with the '!' marker."""


class TableHeaderSinglerow(TableHeader):
	"""First header cell of a line that holds several '!!'-separated cells."""


class TableHeaderContinued(TableHeader):
	"""A later header cell of a line that holds several '!!'-separated cells."""


class TableData:
	"""Lexeme type for a line starting with the '|' marker."""


class TableDataSinglerow(TableData):
	"""First data cell of a line that holds several '||'-separated cells."""


class TableDataContinued(TableData):
	"""A later data cell of a line that holds several '||'-separated cells."""


class TableEnd:
	"""Lexeme type for a line starting with the '|}' marker."""

# what should happen for each type
actions = {
	# Store the item on a row of its own
	'singlerow': (TableStart, TableCaption, TableEnd),
	# Store the data without the syntax marker
	'data': (TableData,),
	# Store the full raw text
	'raw': (TableHeader,),
}

# associations between wiki syntax and types
# The order is significant: wikitableparse uses the first entry whose marker
# matches the start of a line, so the two-character markers that begin with '|'
# have to be listed before the bare '|'.
wikitypes = [
	('{|', TableStart),
	('|+', TableCaption),
	('|-', TableRow),
	('|}', TableEnd),
	('!', TableHeader),
	('|', TableData),
]


# a generator that returns Lexemes. Input is a single string with a wikitable.
def wikitableparse(table):
	"""Split a wikitable into Lexemes.

	table is a single string holding the wikitable source. The function is a
	generator yielding one Lexeme per syntactic item, in input order:

	* type -- the class associated (through wikitypes) with the marker the item
	  starts with. Cells from a line that holds several cells get the
	  ...Singlerow type (first cell) or the ...Continued type (later cells).
	* data -- the item's text without its leading marker, including any
	  following lines that do not start with a marker.
	* raw  -- the text of the line (or cell) as it appeared, marker included.

	If no marker is found at all, a single PreTable lexeme holding the text is
	yielded; text that precedes the first marker is otherwise discarded.
	"""
	lines = table.split('\n')
	# Input ending in a newline leaves an empty last element; it is not a line.
	if not lines[-1]:
		del lines[-1]
	current = dict(type=PreTable, data='', raw='')

	# Line types that may hold several cells on one line:
	# (type of the line, cell separator, type of first cell, type of later cells)
	multicell = (
		(TableData, '||', TableDataSinglerow, TableDataContinued),
		(TableHeader, '!!', TableHeaderSinglerow, TableHeaderContinued),
	)

	for row in lines:
		srow = row.lstrip()
		for marker, lextype in wikitypes:
			if not srow.startswith(marker):
				continue

			# A new item starts here, so the previous one is complete.
			if current['type'] != PreTable:
				yield Lexeme(**current)
			current = dict(type=lextype, data=srow[len(marker):], raw=row)

			# process multiple cells on one line
			for linetype, sep, firsttype, conttype in multicell:
				if lextype == linetype and sep in current['data']:
					cells = row.split(sep)
					# Strip exactly the marker from the first cell, the same way
					# as is done for a line holding a single cell.
					yield Lexeme(type=firsttype, data=cells[0].lstrip()[len(marker):], raw=cells[0])
					for cell in cells[1:-1]:
						yield Lexeme(type=conttype, data=cell, raw=sep+cell)
					# The last cell stays open: following lines without a marker
					# are a continuation of it.
					current = dict(type=conttype, data=cells[-1], raw=sep+cells[-1])
					break

			# Don't try to match again if we already have a match
			break

		# continuation of previous lexeme on next line
		else:
			# NOTE: only 'data' receives the continuation line, 'raw' does not.
			current['data'] += '\n' + row

	yield Lexeme(**current)


def wiki2csv(wikifile, csvfile):
	"""Convert the wikitable read from wikifile to CSV written to csvfile.

	wikifile is a readable text file object, csvfile a writable one. Every
	table row becomes one CSV row. Items listed under actions['singlerow'] are
	written, in raw form, as a CSV row of their own. Cells listed under
	actions['data'] are stored without their syntax marker, cells listed under
	actions['raw'] are stored with it. Empty rows are not written.
	"""
	writer = csv.writer(csvfile)
	row = []
	for lex in wikitableparse(wikifile.read()):
		# issubclass rather than a membership test: the cells of a line that
		# holds several cells carry a subclass of TableData / TableHeader and
		# must be handled like their base type instead of being dropped.
		if lex.type == TableRow:
			if row:
				writer.writerow(row)
			row = []
		elif issubclass(lex.type, actions['singlerow']):
			if row:
				writer.writerow(row)
			writer.writerow([lex.raw])
			row = []
		elif issubclass(lex.type, actions['data']):
			row.append(lex.data)
		elif issubclass(lex.type, actions['raw']):
			row.append(lex.raw)
	# flush the cells collected after the last row marker
	if row:
		writer.writerow(row)


# Patterns that recognise, at the start of a CSV cell, the markers of the types
# listed under actions['raw'] and actions['singlerow'] respectively.
rawtypes = re.compile('|'.join(
	[re.escape(syntax) for syntax, lextype in wikitypes if lextype in actions['raw']]))
singlerowtypes = re.compile('|'.join(
	[re.escape(syntax) for syntax, lextype in wikitypes if lextype in actions['singlerow']]))

def parsecsv(csvfile):
	"""Yield the lines of wikitable markup for the CSV data in csvfile.

	csvfile is a readable text file object (anything csv.reader accepts). The
	yielded strings carry no trailing newline.

	For every CSV row the cells are yielded one by one:

	* a cell starting with a marker matched by singlerowtypes is yielded as is
	  and ends the row; the remaining cells of that CSV row are ignored and no
	  row marker follows it;
	* a cell starting with a marker matched by rawtypes is yielded as is;
	* any other cell is yielded as a data cell, prefixed with '|'.

	A row that is not ended by a singlerow item is followed by the row marker
	'|-'. Blank CSV lines produce no output.
	"""
	for line in csv.reader(csvfile):
		# csv.reader gives an empty list for a blank line. There is no cell to
		# inspect then, so skip it instead of looking at a cell of an earlier row.
		if not line:
			continue
		for cell in line:
			if singlerowtypes.match(cell):
				yield cell
				break
			elif rawtypes.match(cell):
				yield cell
			elif cell and cell[0] in '-+}':
				# Avoid a cornercase where a normal data cell has e.g. '-1' as content,
				# which would result in a new row marker
				yield '| '+cell
			else:
				yield '|'+cell
		else:
			# Only reached when the row was not cut short by a singlerow item.
			yield '|-'

def csv2wiki(csvfile, wikifile):
	"""Convert the CSV data read from csvfile to wikitable markup in wikifile.

	Each string produced by parsecsv is written to wikifile as a line of its own.
	"""
	for markup in parsecsv(csvfile):
		line = markup + '\n'
		wikifile.write(line)


def main():
	"""Command line entry point.

	Parses the command line and converts SOURCE to DEST. The direction is
	taken from --tocsv / --towiki when given. Otherwise it depends on the name
	the program was started under: 'csv2wiki' converts CSV to wikitable
	syntax, any other name converts a wikitable to CSV.
	"""
	progname = os.path.basename(sys.argv[0])
	progname_cooked = os.path.splitext(progname)[0]

	# to show the correct help text
	towikidefault = tocsvdefault = ''
	if progname_cooked == 'csv2wiki':
		towikidefault = '(default for {}) '.format(progname)
		description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."
	else:
		tocsvdefault = '(default for {}) '.format(progname)
		description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."

	# parse arguments
	parser = argparse.ArgumentParser(description=description)
	parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")

	direction_group = parser.add_mutually_exclusive_group()
	direction_group.add_argument('--tocsv', '-c', action='store_true',
		help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")
	direction_group.add_argument('--towiki', '-w', action='store_true',
		help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")

	parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
		help="The input file to read from. Omit or use '-' to read from stdin")
	parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
		help="The file to write output to. Omit or use '-' to write to stdout")

	args = parser.parse_args()

	# An explicit option wins; without one the program name decides.
	if args.towiki:
		direction = 'towiki'
	elif args.tocsv:
		direction = 'tocsv'
	elif progname_cooked == 'csv2wiki':
		direction = 'towiki'
	else:
		direction = 'tocsv'

	if args.verbose:
		# print() is a function in Python 3; the Python 2 form "print >>file"
		# raises a TypeError here.
		print('direction=%s' % direction, 'source=%s' % args.source, 'dest=%s' % args.dest,
			sep='\n', file=sys.stderr)

	try:
		if direction == 'towiki':
			csv2wiki(args.source, args.dest)
		else:
			wiki2csv(args.source, args.dest)
	finally:
		# Close the files that were opened for us, but leave the standard
		# streams alone.
		for stream in (args.source, args.dest):
			if stream is not sys.stdin and stream is not sys.stdout:
				stream.close()

	if args.verbose:
		print('Conversion completed', file=sys.stderr)


if __name__ == '__main__':
	main()