add script to convert wiki tables to csv
This commit is contained in:
parent
7f231c2e5b
commit
6a828f4a52
1 changed files with 211 additions and 0 deletions
211
python/wiki2csv.py
Normal file
211
python/wiki2csv.py
Normal file
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/python3
|
||||
# -*- tab-width: 2; indent-tabs-mode: t; -*-
|
||||
|
||||
# Copyright 2012 Jan Kanis
|
||||
# License: GPL-3.0
|
||||
|
||||
|
||||
# wiki2csv
|
||||
#
|
||||
# An explanation of this program is given in the accompanying README file.
|
||||
# This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/
|
||||
# If you find any bugs, you can report them there.
|
||||
# For command line options, see the help output of "wiki2csv.py --help".
|
||||
# See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax.
|
||||
|
||||
|
||||
from collections import namedtuple
|
||||
import sys, re, os.path, argparse, csv
|
||||
|
||||
|
||||
# One token of wikitable markup. `type` is one of the lexeme type classes,
# `data` is the text with the leading syntax marker removed and `raw` is the
# text as it appeared in the input.
Lexeme = namedtuple(typename='Lexeme', field_names='type data raw')
|
||||
|
||||
# the different lexeme types in the wiki table syntax
|
||||
class PreTable:
    """Lexeme type for all text that comes before the table starts."""


class TableStart:
    """Lexeme type for the line that opens a table ('{|')."""


class TableCaption:
    """Lexeme type for a table caption line ('|+')."""


class TableRow:
    """Lexeme type for a row separator line ('|-')."""


class TableHeader:
    """Lexeme type for a header cell ('!')."""


class TableHeaderSinglerow(TableHeader):
    """First header cell of a line that holds several '!!'-separated cells."""


class TableHeaderContinued(TableHeader):
    """Header cell that follows another header cell on the same line."""


class TableData:
    """Lexeme type for a data cell ('|')."""


class TableDataSinglerow(TableData):
    """First data cell of a line that holds several '||'-separated cells."""


class TableDataContinued(TableData):
    """Data cell that follows another data cell on the same line."""


class TableEnd:
    """Lexeme type for the line that closes a table ('|}')."""
|
||||
|
||||
# What wiki2csv should do with a lexeme, keyed by action name. Each value is
# the tuple of lexeme types that the action applies to.
actions = {
    # Store the item on a row of its own
    'singlerow': (TableStart, TableCaption, TableEnd),
    # Store the data without the syntax marker
    'data': (TableData,),
    # Store the full raw text
    'raw': (TableHeader,),
}
|
||||
|
||||
# Associations between wiki syntax markers and lexeme types.
# The list is searched in order and the first marker a line starts with wins,
# so the two-character markers must stay ahead of the bare "|".
wikitypes = [
    ("{|", TableStart),
    ("|+", TableCaption),
    ("|-", TableRow),
    ("|}", TableEnd),
    ("!", TableHeader),
    ("|", TableData),
]
|
||||
|
||||
|
||||
def wikitableparse(table):
    """Split a wikitable into lexemes.

    A generator that yields Lexeme tuples. `table` is a single string
    holding the wikitable text.

    A line that starts (after leading whitespace) with one of the markers in
    `wikitypes` opens a new lexeme; any other line is a continuation of the
    lexeme that is currently being collected and is appended to both its
    `data` and its `raw` text. Text before the first marker line is collected
    as a PreTable lexeme, which is only yielded when no marker line follows.

    A line that holds several cells ('| a || b' or '! a !! b') is split up:
    the first cell is yielded with the ...Singlerow type and every following
    cell with the ...Continued type.
    """
    lines = table.split('\n')
    # A trailing newline leaves one empty final element; drop it.
    if not lines[-1]:
        del lines[-1]

    # Line types whose cells may share a line:
    # type -> (cell separator, type of the first cell, type of later cells)
    multicell = {
        TableData: ('||', TableDataSinglerow, TableDataContinued),
        TableHeader: ('!!', TableHeaderSinglerow, TableHeaderContinued),
    }

    current = dict(type=PreTable, data='', raw='')

    for row in lines:
        srow = row.lstrip()
        for marker, lextype in wikitypes:
            if not srow.startswith(marker):
                continue

            # A new lexeme starts here, so the previous one is complete.
            if current['type'] != PreTable:
                yield Lexeme(**current)
            current = dict(type=lextype, data=srow[len(marker):], raw=row)

            # process multiple (header) cells on one line
            separator, first_type, continued_type = multicell.get(
                lextype, (None, None, None))
            if separator and separator in current['data']:
                cells = row.split(separator)
                # Strip exactly the marker from the first cell, whatever its
                # length, so that '|a||b' keeps the 'a'.
                yield Lexeme(type=first_type,
                             data=cells[0].lstrip()[len(marker):],
                             raw=cells[0])
                for cell in cells[1:-1]:
                    yield Lexeme(type=continued_type, data=cell,
                                 raw=separator + cell)
                # The last cell stays pending because following lines may
                # still continue it.
                last = cells[-1]
                current = dict(type=continued_type, data=last,
                               raw=separator + last)

            # Don't try to match again if we already have a match
            break

        # continuation of previous lexeme on next line
        else:
            current['data'] += '\n' + row
            # Keep raw complete as well; lexemes that are stored raw would
            # otherwise lose their continuation lines.
            current['raw'] += '\n' + row

    yield Lexeme(**current)
|
||||
|
||||
|
||||
def wiki2csv(wikifile, csvfile):
    """Convert the wikitable read from `wikifile` to CSV rows in `csvfile`.

    wikifile -- readable text file object holding the wikitable
    csvfile  -- writable text file object that receives the CSV output

    Every lexeme produced by wikitableparse is handled according to the
    `actions` table:
      * 'singlerow' types get a CSV row of their own holding their raw text;
      * 'data' types contribute a cell with the text after the marker;
      * 'raw' types contribute a cell with their full raw text;
      * a TableRow lexeme ends the CSV row collected so far.
    Lexemes of any other type are skipped. Empty rows are never written.
    """
    writer = csv.writer(csvfile)
    row = []
    for lex in wikitableparse(wikifile.read()):
        # issubclass rather than `in`: the parser also emits the
        # ...Singlerow / ...Continued subclasses for lines with several
        # cells, and those must be treated like their base type.
        if lex.type == TableRow:
            if row:
                writer.writerow(row)
            row = []
        elif issubclass(lex.type, actions['singlerow']):
            if row:
                writer.writerow(row)
            writer.writerow([lex.raw])
            row = []
        elif issubclass(lex.type, actions['data']):
            row.append(lex.data)
        elif issubclass(lex.type, actions['raw']):
            row.append(lex.raw)

    # Flush the cells collected after the last row marker.
    if row:
        writer.writerow(row)
|
||||
|
||||
|
||||
# Regexes used when converting CSV back to wiki syntax. Each one matches a
# cell that starts with the marker of a lexeme type that was stored with its
# raw text (rawtypes) or on a CSV row of its own (singlerowtypes).
rawtypes = re.compile('|'.join(
    [re.escape(marker) for marker, lextype in wikitypes
     if lextype in actions['raw']]))
singlerowtypes = re.compile('|'.join(
    [re.escape(marker) for marker, lextype in wikitypes
     if lextype in actions['singlerow']]))
|
||||
|
||||
def parsecsv(csvfile):
    """Yield the lines of wikitable markup for the CSV read from `csvfile`.

    csvfile -- readable text file object (or other iterable of lines)
               holding the CSV data

    Each yielded string is one line of wiki markup without a trailing
    newline. A cell that starts with a single-row marker is yielded as is
    and ends its CSV row; a cell that starts with a raw marker is yielded as
    is; every other cell is turned into a data cell by prefixing '|'. Each
    CSV row that did not consist of a single-row item is followed by a '|-'
    row marker. CSV rows without any cell are skipped.
    """
    for line in csv.reader(csvfile):
        if not line:
            # Blank CSV line: there is no cell to convert and no table row
            # to terminate.
            continue
        for cell in line:
            if singlerowtypes.match(cell):
                yield cell
                break
            elif rawtypes.match(cell):
                yield cell
            elif cell.startswith(('-', '+', '}')):
                # Avoid a cornercase where a normal data cell has e.g. '-1'
                # as content, which would result in a new row marker
                yield '| ' + cell
            else:
                yield '|' + cell
        else:
            # Only reached when no single-row item ended the row early.
            yield '|-'
|
||||
|
||||
def csv2wiki(csvfile, wikifile):
    """Convert the CSV read from `csvfile` to wikitable markup in `wikifile`.

    Every item produced by parsecsv is written to `wikifile` followed by a
    newline.
    """
    for markup in parsecsv(csvfile):
        wikifile.write('{}\n'.format(markup))
|
||||
|
||||
|
||||
def main():
    """Command line entry point: parse the arguments and run the conversion.

    The default direction depends on the name the program is invoked under:
    when the base name (without extension) is 'csv2wiki' the default is CSV
    to wikitable, otherwise wikitable to CSV. The --tocsv and --towiki
    options override that default. With --verbose, progress information is
    printed to stderr.
    """
    progname = os.path.basename(sys.argv[0])
    progname_cooked = os.path.splitext(progname)[0]

    # to show the correct help text
    towikidefault = tocsvdefault = ''
    if progname_cooked == 'csv2wiki':
        towikidefault = '(default for {}) '.format(progname)
        description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given."
    else:
        tocsvdefault = '(default for {}) '.format(progname)
        description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given."

    # parse arguments
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose")

    direction_group = parser.add_mutually_exclusive_group()
    direction_group.add_argument('--tocsv', '-c', action='store_true',
        help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST")
    direction_group.add_argument('--towiki', '-w', action='store_true',
        help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST")

    # NOTE(review): the csv module documentation recommends opening CSV files
    # with newline=''; argparse.FileType offers no way to pass that - confirm
    # whether the line endings matter on the target platforms.
    parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
        help="The input file to read from. Omit or use '-' to read from stdin")
    parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
        help="The file to write output to. Omit or use '-' to write to stdout")

    args = parser.parse_args()

    # An explicit option wins; otherwise fall back to the program name.
    if args.towiki:
        direction = 'towiki'
    elif args.tocsv:
        direction = 'tocsv'
    elif progname_cooked == 'csv2wiki':
        direction = 'towiki'
    else:
        direction = 'tocsv'

    if args.verbose:
        # print() function: the Python 2 "print >>stream" statement fails
        # with a TypeError under Python 3.
        print('direction=%s' % direction, 'source=%s' % args.source,
              'dest=%s' % args.dest, sep='\n', file=sys.stderr)

    if direction == 'towiki':
        csv2wiki(args.source, args.dest)
    else:
        wiki2csv(args.source, args.dest)

    if args.verbose:
        print('Conversion completed', file=sys.stderr)
|
||||
|
||||
|
||||
# Run the command line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Reference in a new issue