snippets/python/wiki2csv.py

#!/usr/bin/python3
# -*- tab-width: 2; indent-tabs-mode: t; -*-

# Copyright 2012 Jan Kanis
# License: GPL-3.0


# wiki2csv
#
# An explanation of this program is given in the accompanying README file.
# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/
# If you find any bugs, you can report them there.
# For command line options, see the help output of "wiki2csv.py --help".
# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.


from collections import namedtuple
import sys, re, os.path, argparse, csv


# One element of a parsed wikitable. `type` is one of the lexeme classes below,
# `data` is the text following the syntax marker and `raw` is the source text
# including the marker.
Lexeme = namedtuple('Lexeme', ('type', 'data', 'raw'))

# the different lexeme types in the wiki table syntax
class PreTable:
	"""All text before the table starts gets this type."""


class TableStart:
	"""Start of the table (the '{|' marker)."""


class TableCaption:
	"""Caption of the table (the '|+' marker)."""


class TableRow:
	"""Row separator (the '|-' marker)."""


class TableHeader:
	"""Header cell (the '!' marker)."""


class TableHeaderSinglerow(TableHeader):
	"""First header cell of a line holding several '!!'-separated cells."""


class TableHeaderContinued(TableHeader):
	"""Any later header cell of a line holding several '!!'-separated cells."""


class TableData:
	"""Data cell (the '|' marker)."""


class TableDataSinglerow(TableData):
	"""First data cell of a line holding several '||'-separated cells."""


class TableDataContinued(TableData):
	"""Any later data cell of a line holding several '||'-separated cells."""


class TableEnd:
	"""End of the table (the '|}' marker)."""

# what should happen for each type
actions = {
	# Store the item on a row of its own
	'singlerow': (TableStart, TableCaption, TableEnd),
	# Store the data without the syntax marker
	'data': (TableData,),
	# Store the full raw text
	'raw': (TableHeader,),
}

# associations between wiki syntax and types
# Order matters: the parser takes the first marker a line starts with, so the
# bare '|' has to come after every two-character marker that begins with '|'.
wikitypes = [
	('{|', TableStart),
	('|+', TableCaption),
	('|-', TableRow),
	('|}', TableEnd),
	('!', TableHeader),
	('|', TableData),
]


# a generator that returns Lexemes. Input is a single string with a wikitable.
def wikitableparse(table):
	"""Split the source text of a wikitable into Lexemes.

	table -- a single string containing the wikitable source.

	This is a generator. Every line that starts (after optional leading
	whitespace) with one of the markers in `wikitypes` begins a new Lexeme;
	any other line is a continuation of the Lexeme before it and is appended
	to both its `data` and its `raw` text. A line holding several cells
	separated by '||' (data) or '!!' (headers) yields one Lexeme per cell:
	a ...Singlerow lexeme for the first cell and ...Continued lexemes for the
	others.

	Text before the first marker is collected as a PreTable lexeme, which is
	only yielded when the input contains no marker at all.
	"""
	lines = table.split('\n')
	# A trailing newline leaves an empty last element, which is not a line.
	if not lines[-1]:
		del lines[-1]

	# (base type, cell separator, type of the first cell, type of later cells)
	multicell = (
		(TableData, '||', TableDataSinglerow, TableDataContinued),
		(TableHeader, '!!', TableHeaderSinglerow, TableHeaderContinued),
	)
	current = dict(type=PreTable, data='', raw='')

	for row in lines:
		srow = row.lstrip()
		for marker, lextype in wikitypes:
			if not srow.startswith(marker):
				continue
			if current['type'] != PreTable:
				yield Lexeme(**current)
			current = dict(type=lextype, data=srow[len(marker):], raw=row)

			# process multiple cells on one line
			for basetype, separator, firsttype, resttype in multicell:
				if lextype != basetype or separator not in current['data']:
					continue
				cells = row.split(separator)
				# Strip exactly the marker, as is done for a single-cell line,
				# so that a cell written without a space after the marker
				# keeps its first character.
				yield Lexeme(type=firsttype, data=cells[0].lstrip()[len(marker):], raw=cells[0])
				for cell in cells[1:-1]:
					yield Lexeme(type=resttype, data=cell, raw=separator+cell)
				# The last cell is not yielded yet: it stays current so that
				# following lines can still be appended to it.
				current = dict(type=resttype, data=cells[-1], raw=separator+cells[-1])
				break

			# Don't try to match again if we already have a match
			break

		# continuation of previous lexeme on next line
		else:
			current['data'] += '\n' + row
			# keep raw complete too, it is what gets stored for headers,
			# captions and the table start/end
			current['raw'] += '\n' + row

	yield Lexeme(**current)


def wiki2csv(wikifile, csvfile):
	"""Convert the wikitable read from wikifile to CSV written to csvfile.

	wikifile -- readable text file object containing the wikitable source.
	csvfile  -- writable text file object that receives the CSV output.

	Lexemes whose type is listed under actions['singlerow'] are written as a
	CSV row of their own holding the raw text. Data cells are stored without
	their syntax marker and header cells as raw text; the cells collected
	since the last row boundary form one CSV row. Empty rows are not written.
	"""
	writer = csv.writer(csvfile)
	row = []
	for lex in wikitableparse(wikifile.read()):
		ownrow = lex.type in actions['singlerow']
		if ownrow or lex.type == TableRow:
			# both kinds end the CSV row collected so far
			if row:
				writer.writerow(row)
				row = []
			if ownrow:
				writer.writerow([lex.raw])
		# issubclass rather than `in`: the parser emits the ...Singlerow and
		# ...Continued subclasses for lines with several cells, and a plain
		# membership test would silently drop those cells.
		elif issubclass(lex.type, actions['data']):
			row.append(lex.data)
		elif issubclass(lex.type, actions['raw']):
			row.append(lex.raw)
	if row:
		writer.writerow(row)


def _markerpattern(action):
	"""Compile a regex matching any syntax marker whose type is in actions[action]."""
	escaped = [re.escape(marker) for marker, lextype in wikitypes
			if lextype in actions[action]]
	return re.compile('|'.join(escaped))

# matches the start of a CSV cell that holds the raw text of a lexeme
rawtypes = _markerpattern('raw')
# matches the start of a CSV cell that holds a lexeme stored on a row of its own
singlerowtypes = _markerpattern('singlerow')

def parsecsv(csvfile):
	"""Yield the lines of wikitable source for the CSV data in csvfile.

	csvfile -- readable text file object (or any iterable of lines) with CSV.

	This is a generator of strings without trailing newline. A cell that
	starts with a marker matched by `singlerowtypes` is yielded unchanged and
	ends its CSV row: the remaining cells of that row are ignored and no row
	marker follows. A cell starting with a marker matched by `rawtypes` is
	yielded unchanged. Every other cell is yielded as a data cell with a '|'
	in front. Each other CSV row is followed by a '|-' row marker. Blank CSV
	lines are skipped.
	"""
	for line in csv.reader(csvfile):
		# csv.reader returns an empty list for a blank line; there is no cell
		# to inspect, so such a line produces no output.
		if not line:
			continue
		for cell in line:
			if singlerowtypes.match(cell):
				yield cell
				break
			if rawtypes.match(cell):
				yield cell
			elif cell[:1] in ('-', '+', '}'):
				# Avoid a cornercase where a normal data cell has e.g. '-1' as content,
				# which would result in a new row marker
				yield '| '+cell
			else:
				yield '|'+cell
		else:
			# only reached when the row was not cut short by a singlerow cell
			yield '|-'

def csv2wiki(csvfile, wikifile):
	"""Convert the CSV read from csvfile to wikitable source written to wikifile.

	Each string produced by parsecsv() is written as a line of its own.
	"""
	emit = wikifile.write
	for wikiline in parsecsv(csvfile):
		emit(wikiline + '\n')


def main():
	"""Command line entry point.

	Parses the command line, then converts SOURCE to DEST. The direction is
	taken from --towiki / --tocsv when given; otherwise it depends on the name
	the program was started under: 'csv2wiki' converts CSV to wikitable
	syntax, any other name converts wikitable syntax to CSV.
	"""
	progname = os.path.basename(sys.argv[0])
	progname_cooked = os.path.splitext(progname)[0]

	# to show the correct help text
	towikidefault = tocsvdefault = ''
	if progname_cooked == 'csv2wiki':
		towikidefault = '(default for {}) '.format(progname)
		description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."
	else:
		tocsvdefault = '(default for {}) '.format(progname)
		description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."

	# parse arguments
	parser = argparse.ArgumentParser(description=description)
	parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")

	# kept in its own name so it is not confused with the direction string below
	directiongroup = parser.add_mutually_exclusive_group()
	directiongroup.add_argument('--tocsv', '-c', action='store_true',
		help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")
	directiongroup.add_argument('--towiki', '-w', action='store_true',
		help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")

	parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
		help="The input file to read from. Omit or use '-' to read from stdin")
	parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
		help="The file to write output to. Omit or use '-' to write to stdout")

	args = parser.parse_args()

	# an explicit option wins, the program name only supplies the default
	if args.towiki:
		direction = 'towiki'
	elif args.tocsv:
		direction = 'tocsv'
	elif progname_cooked == 'csv2wiki':
		direction = 'towiki'
	else:
		direction = 'tocsv'

	if args.verbose:
		# print() as a function: the Python 2 "print >>file" statement form
		# raises a TypeError when run by the python3 interpreter of the shebang
		print('direction=%s' % direction, 'source=%s' % args.source, 'dest=%s' % args.dest,
			sep='\n', file=sys.stderr)

	if direction == 'towiki':
		csv2wiki(args.source, args.dest)
	else:
		wiki2csv(args.source, args.dest)

	if args.verbose:
		print('Conversion completed', file=sys.stderr)


if __name__ == '__main__':
	main()
add script to convert wiki tables to csv 2022-08-20 00:04:31 +02:00			`#!/usr/bin/python3`
			`# -- tab-width: 2; indent-tabs-mode: t; --`

			`# Copyright 2012 Jan Kanis`
			`# License: GPL-3.0`


			`# wiki2csv`
			`#`
			`# An explanation of this program is given in the accompanying README file.`
			`# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/`
			`# If you find any bugs, you can report them there.`
			`# For command line options, see the help output of "wiki2csv.py --help".`
			`# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.`


			`from collections import namedtuple`
			`import sys, re, os.path, argparse, csv`


			`Lexeme = namedtuple('Lexeme', 'type data raw')`

			`# the different lexeme types in the wiki table syntax`
			`class PreTable (object): # All text before the table starts gets this type`
			`pass`
			`class TableStart (object):`
			`pass`
			`class TableCaption (object):`
			`pass`
			`class TableRow (object):`
			`pass`
			`class TableHeader (object):`
			`pass`
			`class TableHeaderSinglerow (TableHeader):`
			`pass`
			`class TableHeaderContinued (TableHeader):`
			`pass`
			`class TableData (object):`
			`pass`
			`class TableDataSinglerow (TableData):`
			`pass`
			`class TableDataContinued (TableData):`
			`pass`
			`class TableEnd (object):`
			`pass`

			`# what should happen for each type`
			`actions = dict(`
			`# Store the item on a row of its own`
			`singlerow=(TableStart, TableCaption, TableEnd),`
			`# Store the data without the sytax marker`
			`data=(TableData,),`
			`# Store the full raw text`
			`raw=(TableHeader,)`
			`)`

			`# associations between wiki syntax and types`
			`wikitypes = [`
			`('{\|', TableStart),`
			`('\|+', TableCaption),`
			`('\|-', TableRow),`
			`('\|}', TableEnd),`
			`('!', TableHeader),`
			`('\|', TableData),`
			`]`


			`# a generator that returns Lexemes. Input is a single string with a wikitable.`
			`def wikitableparse(table):`
			`stable = table.split('\n')`
			`if not stable[-1]:`
			`del stable[-1]`
			`current = dict(type=PreTable, data='', raw='')`

			`for row in stable:`
			`srow = row.lstrip()`
			`for marker, type in wikitypes:`
			`if srow.startswith(marker):`
			`if current['type'] != PreTable:`
			`yield Lexeme(**current)`
			`current = dict(type=type, data=srow[len(marker):], raw=row)`

			`# process multiple cells on one line`
			`if current['type'] == TableData and '\|\|' in current['data']:`
			`rows = current['raw'].split('\|\|')`
			`yield Lexeme(type=TableDataSinglerow, data=rows[0].lstrip()[2:], raw=rows[0])`
			`for r in rows[1:-1]:`
			`yield Lexeme(type=TableDataContinued, data=r, raw='\|\|'+r)`
			`current = dict(type=TableDataContinued, data=r, raw='\|\|'+r)`

			`# same for multiple header cells on one line`
			`if current['type'] == TableHeader and '!!' in current['data']:`
			`rows = current['raw'].split('!!')`
			`yield Lexeme(type=TableHeaderSinglerow, data=rows[0].lstrip()[2:], raw=rows[0])`
			`for r in rows[1:-1]:`
			`yield Lexeme(type=TableHeaderContinued, data=r, raw='!!'+r)`
			`current = dict(type=TableHeaderContinued, data=r, raw='!!'+r)`

			`# Don't try to match again if we already hava a match`
			`break`

			`# continuation of previous lexeme on next line`
			`else:`
			`current['data'] += '\n' + row`

			`yield Lexeme(**current)`


			`def wiki2csv(wikifile, csvfile):`
			`writer = csv.writer(csvfile)`
			`parser = wikitableparse(wikifile.read())`
			`row = []`
			`for lex in parser:`
			`if lex.type == TableRow:`
			`if row: writer.writerow(row)`
			`row = []`
			`elif lex.type in actions['singlerow']:`
			`if row: writer.writerow(row)`
			`writer.writerow([lex.raw])`
			`row = []`
			`elif lex.type in actions['data']:`
			`row.append(lex.data)`
			`elif lex.type in actions['raw']:`
			`row.append(lex.raw)`
			`if row:`
			`writer.writerow(row)`


			`rawtypes = re.compile('\|'.join((re.escape(marker) for marker, type in wikitypes`
			`if type in actions['raw'])))`
			`singlerowtypes = re.compile('\|'.join((re.escape(marker) for marker, type in wikitypes`
			`if type in actions['singlerow'])))`

			`def parsecsv(csvfile):`
			`reader = csv.reader(csvfile)`
			`newrow = False`
			`for line in reader:`
			`for cell in line:`
			`if singlerowtypes.match(cell):`
			`yield cell`
			`break`
			`elif rawtypes.match(cell):`
			`yield cell`
			`elif len(cell) and cell[0] in '-+}':`
			`# Avoid a cornercase where a normal data cell has e.g. '-1' as content,`
			`# which would result in a new row marker`
			`yield '\| '+cell`
			`else:`
			`yield '\|'+cell`
			`if not singlerowtypes.match(cell):`
			`yield '\|-'`

			`def csv2wiki(csvfile, wikifile):`
			`for cell in parsecsv(csvfile):`
			`wikifile.write(cell+'\n')`


			`def main():`

			`progname = os.path.basename(sys.argv[0])`
			`progname_cooked = os.path.splitext(progname)[0]`

			`# to show the correct help text`
			`towikidefault = tocsvdefault = ''`
			`if progname_cooked == 'csv2wiki':`
			`towikidefault = '(default for {}) '.format(progname)`
			`description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."`
			`else:`
			`tocsvdefault = '(default for {}) '.format(progname)`
			`description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."`

			`# parse arguments`
			`parser = argparse.ArgumentParser(description=description)`
			`parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")`

			`direction = parser.add_mutually_exclusive_group()`
			`direction.add_argument('--tocsv', '-c', action='store_true',`
			`help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")`
			`direction.add_argument('--towiki', '-w', action='store_true',`
			`help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")`

			`parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,`
			`help="The input file to read from. Omit or use '-' to read from stdin")`
			`parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,`
			`help="The file to write output to. Omit or use '-' to write to stdout")`

			`args = parser.parse_args()`

			`if args.towiki:`
			`direction = 'towiki'`
			`elif args.tocsv:`
			`direction = 'tocsv'`
			`elif progname_cooked == 'csv2wiki':`
			`direction = 'towiki'`
			`else:`
			`direction = 'tocsv'`

			`if args.verbose:`
			`print >>sys.stderr, 'direction=%s\n' % direction, 'source=%s\n' % args.source, 'dest=%s\n' % args.dest,`

			`if direction == 'towiki':`
			`csv2wiki(args.source, args.dest)`
			`else:`
			`wiki2csv(args.source, args.dest)`

			`if args.verbose:`
			`print >>sys.stderr, 'Conversion completed'`


			`if __name__ == '__main__':`
			`main()`