211 lines
6.1 KiB
Python
211 lines
6.1 KiB
Python
#!/usr/bin/python3
|
|
# -*- tab-width: 2; indent-tabs-mode: t; -*-
|
|
|
|
# Copyright 2012 Jan Kanis
|
|
# License: GPL-3.0
|
|
|
|
|
|
# wiki2csv
|
|
#
|
|
# An explanation of this program is given in the accompanying README file.
|
|
# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/
|
|
# If you find any bugs, you can report them there.
|
|
# For command line options, see the help output of "wiki2csv.py --help".
|
|
# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.
|
|
|
|
|
|
from collections import namedtuple
|
|
import sys, re, os.path, argparse, csv
|
|
|
|
|
|
# One token produced by the wikitable lexer. `type` is one of the lexeme
# classes below, `data` is the text following the syntax marker and `raw` is
# the source text including the marker.
Lexeme = namedtuple('Lexeme', ['type', 'data', 'raw'])
|
|
|
|
# the different lexeme types in the wiki table syntax
|
|
class PreTable(object):
    """Lexeme type given to all text that comes before the table starts."""
|
|
class TableStart(object):
    """Lexeme type for the line that opens a table (the '{|' marker)."""
|
|
class TableCaption(object):
    """Lexeme type for a table caption line (the '|+' marker)."""
|
|
class TableRow(object):
    """Lexeme type for a row separator line (the '|-' marker)."""
|
|
class TableHeader(object):
    """Lexeme type for a header cell (the '!' marker)."""


class TableHeaderSinglerow(TableHeader):
    """First header cell of a line that holds several '!!'-separated cells."""


class TableHeaderContinued(TableHeader):
    """Any later header cell of a line that holds several '!!'-separated cells."""
|
|
class TableData(object):
    """Lexeme type for a data cell (the '|' marker)."""


class TableDataSinglerow(TableData):
    """First data cell of a line that holds several '||'-separated cells."""


class TableDataContinued(TableData):
    """Any later data cell of a line that holds several '||'-separated cells."""
|
|
class TableEnd(object):
    """Lexeme type for the line that closes a table (the '|}' marker)."""
|
|
|
|
# what should happen for each type
|
|
actions = {
    # Store the item on a CSV row of its own.
    'singlerow': (TableStart, TableCaption, TableEnd),
    # Store the data without the syntax marker.
    'data': (TableData,),
    # Store the full raw text, syntax marker included.
    'raw': (TableHeader,),
}
|
|
|
|
# associations between wiki syntax and types
|
|
# Each wiki syntax marker paired with the lexeme type it introduces.
# The lexer uses the first marker that prefixes a line, so '|+', '|-' and '|}'
# have to be listed before the bare '|'.
wikitypes = [
    ('{|', TableStart),
    ('|+', TableCaption),
    ('|-', TableRow),
    ('|}', TableEnd),
    ('!', TableHeader),
    ('|', TableData),
]
|
|
|
|
|
|
# a generator that returns Lexemes. Input is a single string with a wikitable.
|
|
def wikitableparse(table):
    """Split the text of a wikitable into a stream of Lexeme tuples.

    Each line is stripped of leading whitespace and compared against the
    markers in ``wikitypes``; the first marker that prefixes the line starts
    a new lexeme. A line that matches no marker is a continuation of the
    current lexeme and is appended (with a newline) to both its ``data`` and
    its ``raw`` text, so neither form loses multi-line content.

    A data line containing '||' or a header line containing '!!' holds
    several cells. It is split into one ``*Singlerow`` lexeme for the first
    cell followed by one ``*Continued`` lexeme for every further cell,
    including the last one.

    Args:
        table: a single string holding the wikitable source.

    Yields:
        Lexeme(type, data, raw) tuples in source order. ``data`` is the text
        after the syntax marker, ``raw`` is the source text including the
        marker (for continued cells the '||' / '!!' separator). Text that
        precedes the first marker is dropped; if the input contains no marker
        at all, a single PreTable lexeme is yielded.
    """
    lines = table.split('\n')
    # A trailing newline leaves an empty last element; drop it so it is not
    # mistaken for a continuation line.
    if not lines[-1]:
        del lines[-1]

    # Base cell type -> (cell separator, type of first cell, type of later cells)
    multicell = {
        TableData: ('||', TableDataSinglerow, TableDataContinued),
        TableHeader: ('!!', TableHeaderSinglerow, TableHeaderContinued),
    }

    current = dict(type=PreTable, data='', raw='')

    for row in lines:
        srow = row.lstrip()
        for marker, lextype in wikitypes:
            if not srow.startswith(marker):
                continue

            # A new lexeme starts on this line, so the previous one is
            # complete. Text before the table is not emitted.
            if current['type'] is not PreTable:
                yield Lexeme(**current)
            data = srow[len(marker):]
            current = dict(type=lextype, data=data, raw=row)

            # Several cells on one line: emit all but the last right away and
            # keep the last one as the current lexeme, so that continuation
            # lines are attached to it.
            separator, first_type, continued_type = multicell.get(
                lextype, (None, None, None))
            if separator is not None and separator in data:
                cells = data.split(separator)
                # Leading whitespace plus the marker, exactly as in the source.
                prefix = row[:len(row) - len(data)]
                yield Lexeme(type=first_type, data=cells[0],
                             raw=prefix + cells[0])
                for cell in cells[1:-1]:
                    yield Lexeme(type=continued_type, data=cell,
                                 raw=separator + cell)
                current = dict(type=continued_type, data=cells[-1],
                               raw=separator + cells[-1])

            # Don't try to match again once a marker has matched.
            break
        else:
            # No marker matched: continuation of the previous lexeme.
            current['data'] += '\n' + row
            current['raw'] += '\n' + row

    yield Lexeme(**current)
|
|
|
|
|
|
def wiki2csv(wikifile, csvfile):
    """Convert the wikitable read from wikifile into CSV written to csvfile.

    The lexemes produced by wikitableparse() are grouped into CSV rows:

    * a TableRow lexeme ends the row collected so far;
    * lexemes listed in ``actions['singlerow']`` end the current row and are
      written, as raw text, on a row of their own;
    * lexemes listed in ``actions['data']`` contribute their data (the text
      without the syntax marker) as one cell;
    * lexemes listed in ``actions['raw']`` contribute their raw text as one
      cell.

    Lexeme types are compared with issubclass() so that the specialised
    types used for several cells on one line (e.g. TableDataSinglerow,
    TableHeaderContinued) are handled like their base type instead of being
    silently dropped. Lexemes of any other type are ignored.

    Args:
        wikifile: readable text file object containing the wikitable.
        csvfile: writable text file object that receives the CSV output.
    """
    writer = csv.writer(csvfile)
    row = []
    for lex in wikitableparse(wikifile.read()):
        if issubclass(lex.type, TableRow):
            if row:
                writer.writerow(row)
            row = []
        elif issubclass(lex.type, actions['singlerow']):
            if row:
                writer.writerow(row)
            writer.writerow([lex.raw])
            row = []
        elif issubclass(lex.type, actions['data']):
            row.append(lex.data)
        elif issubclass(lex.type, actions['raw']):
            # NOTE(review): for continued header cells the parser's raw text
            # starts with the '!!' separator; confirm that is the desired
            # cell content when converting back to wiki syntax.
            row.append(lex.raw)
    # Flush the cells collected after the last row separator.
    if row:
        writer.writerow(row)
|
|
|
|
|
|
# Regex matching, at the start of a CSV cell, any wiki marker whose lexeme
# type is stored as raw text.
rawtypes = re.compile('|'.join(
    re.escape(marker)
    for marker, lextype in wikitypes
    if lextype in actions['raw']
))
# Same, for the markers whose lexeme type is stored on a row of its own.
singlerowtypes = re.compile('|'.join(
    re.escape(marker)
    for marker, lextype in wikitypes
    if lextype in actions['singlerow']
))
|
|
|
|
def parsecsv(csvfile):
    """Translate CSV rows into lines of wikitable syntax.

    Every cell is classified by how it starts:

    * a cell matching ``singlerowtypes`` is emitted unchanged and ends the
      processing of its CSV row; no row separator follows it;
    * a cell matching ``rawtypes`` is emitted unchanged;
    * any other cell is a data cell and is emitted with a leading '|'. If the
      cell starts with '-', '+' or '}', a space is inserted after the '|' so
      the result is not read as a row, caption or table-end marker.

    After a row that was processed to its end a '|-' row separator is
    emitted. Blank CSV lines, which csv.reader returns as empty lists, are
    skipped.

    Args:
        csvfile: readable text file object (or iterable of lines) with CSV.

    Yields:
        Strings, each one line (or multi-line cell) of wikitable source
        without a trailing newline.
    """
    for line in csv.reader(csvfile):
        if not line:
            # No cells at all: nothing to emit, and no row to terminate.
            continue
        for cell in line:
            if singlerowtypes.match(cell):
                yield cell
                break
            if rawtypes.match(cell):
                yield cell
            elif cell[:1] in ('-', '+', '}'):
                # Avoid a corner case where a normal data cell has e.g. '-1'
                # as content, which would result in a new row marker.
                yield '| ' + cell
            else:
                yield '|' + cell
        else:
            # The row was not cut short by a single-row item: close it.
            yield '|-'
|
|
|
|
def csv2wiki(csvfile, wikifile):
    """Convert the CSV read from csvfile into wikitable text in wikifile.

    Every string produced by parsecsv() is written to wikifile followed by a
    newline.
    """
    wikilines = parsecsv(csvfile)
    for wikiline in wikilines:
        wikifile.write(wikiline + '\n')
|
|
|
|
|
|
def main():
    """Command line entry point: parse the arguments and run the conversion.

    The conversion direction defaults to CSV -> wikitable when the program is
    invoked under the name 'csv2wiki' (any file extension is ignored) and to
    wikitable -> CSV otherwise. The --tocsv and --towiki options override the
    default. SOURCE and DEST default to stdin and stdout.
    """
    progname = os.path.basename(sys.argv[0])
    progname_cooked = os.path.splitext(progname)[0]
    towiki_is_default = progname_cooked == 'csv2wiki'

    # Mark the option that is the default for this program name, so that the
    # help text is correct.
    towikidefault = tocsvdefault = ''
    if towiki_is_default:
        towikidefault = '(default for {}) '.format(progname)
        description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."
    else:
        tocsvdefault = '(default for {}) '.format(progname)
        description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."

    # parse arguments
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")

    direction_group = parser.add_mutually_exclusive_group()
    direction_group.add_argument('--tocsv', '-c', action='store_true',
        help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")
    direction_group.add_argument('--towiki', '-w', action='store_true',
        help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")

    parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
        help="The input file to read from. Omit or use '-' to read from stdin")
    parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
        help="The file to write output to. Omit or use '-' to write to stdout")

    args = parser.parse_args()

    # An explicit option wins; otherwise the program name decides.
    if args.towiki:
        direction = 'towiki'
    elif args.tocsv:
        direction = 'tocsv'
    elif towiki_is_default:
        direction = 'towiki'
    else:
        direction = 'tocsv'

    # sys.stderr.write() is used instead of the Python 2 'print >>' statement,
    # which raises TypeError under Python 3.
    if args.verbose:
        sys.stderr.write('direction=%s\nsource=%s\ndest=%s\n'
                         % (direction, args.source, args.dest))

    if direction == 'towiki':
        csv2wiki(args.source, args.dest)
    else:
        wiki2csv(args.source, args.dest)

    if args.verbose:
        sys.stderr.write('Conversion completed\n')
|
|
|
|
|
|
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
|