#!/usr/bin/python3 # -*- tab-width: 2; indent-tabs-mode: t; -*- # Copyright 2012 Jan Kanis # License: GPL-3.0 # wiki2csv # # An explanation of this program is given in the accompanying README file. # This program is maintained at http://www.bitbucket.org/JanKanis/wiki2csv/ # If you find any bugs, you can report them there. # For command line options, see the help output of "wiki2csv.py --help". # See http://en.wikipedia.org/wiki/Help:Wikitable for the wikitable syntax. from collections import namedtuple import sys, re, os.path, argparse, csv Lexeme = namedtuple('Lexeme', 'type data raw') # the different lexeme types in the wiki table syntax class PreTable (object): # All text before the table starts gets this type pass class TableStart (object): pass class TableCaption (object): pass class TableRow (object): pass class TableHeader (object): pass class TableHeaderSinglerow (TableHeader): pass class TableHeaderContinued (TableHeader): pass class TableData (object): pass class TableDataSinglerow (TableData): pass class TableDataContinued (TableData): pass class TableEnd (object): pass # what should happen for each type actions = dict( # Store the item on a row of its own singlerow=(TableStart, TableCaption, TableEnd), # Store the data without the sytax marker data=(TableData,), # Store the full raw text raw=(TableHeader,) ) # associations between wiki syntax and types wikitypes = [ ('{|', TableStart), ('|+', TableCaption), ('|-', TableRow), ('|}', TableEnd), ('!', TableHeader), ('|', TableData), ] # a generator that returns Lexemes. Input is a single string with a wikitable. def wikitableparse(table): stable = table.split('\n') if not stable[-1]: del stable[-1] current = dict(type=PreTable, data='', raw='') for row in stable: srow = row.lstrip() for marker, type in wikitypes: if srow.startswith(marker): if current['type'] != PreTable: yield Lexeme(**current) current = dict(type=type, data=srow[len(marker):], raw=row) # process multiple cells on one line if current['type'] == TableData and '||' in current['data']: rows = current['raw'].split('||') yield Lexeme(type=TableDataSinglerow, data=rows[0].lstrip()[2:], raw=rows[0]) for r in rows[1:-1]: yield Lexeme(type=TableDataContinued, data=r, raw='||'+r) current = dict(type=TableDataContinued, data=r, raw='||'+r) # same for multiple header cells on one line if current['type'] == TableHeader and '!!' in current['data']: rows = current['raw'].split('!!') yield Lexeme(type=TableHeaderSinglerow, data=rows[0].lstrip()[2:], raw=rows[0]) for r in rows[1:-1]: yield Lexeme(type=TableHeaderContinued, data=r, raw='!!'+r) current = dict(type=TableHeaderContinued, data=r, raw='!!'+r) # Don't try to match again if we already hava a match break # continuation of previous lexeme on next line else: current['data'] += '\n' + row yield Lexeme(**current) def wiki2csv(wikifile, csvfile): writer = csv.writer(csvfile) parser = wikitableparse(wikifile.read()) row = [] for lex in parser: if lex.type == TableRow: if row: writer.writerow(row) row = [] elif lex.type in actions['singlerow']: if row: writer.writerow(row) writer.writerow([lex.raw]) row = [] elif lex.type in actions['data']: row.append(lex.data) elif lex.type in actions['raw']: row.append(lex.raw) if row: writer.writerow(row) rawtypes = re.compile('|'.join((re.escape(marker) for marker, type in wikitypes if type in actions['raw']))) singlerowtypes = re.compile('|'.join((re.escape(marker) for marker, type in wikitypes if type in actions['singlerow']))) def parsecsv(csvfile): reader = csv.reader(csvfile) newrow = False for line in reader: for cell in line: if singlerowtypes.match(cell): yield cell break elif rawtypes.match(cell): yield cell elif len(cell) and cell[0] in '-+}': # Avoid a cornercase where a normal data cell has e.g. '-1' as content, # which would result in a new row marker yield '| '+cell else: yield '|'+cell if not singlerowtypes.match(cell): yield '|-' def csv2wiki(csvfile, wikifile): for cell in parsecsv(csvfile): wikifile.write(cell+'\n') def main(): progname = os.path.basename(sys.argv[0]) progname_cooked = os.path.splitext(progname)[0] # to show the correct help text towikidefault = tocsvdefault = '' if progname_cooked == 'csv2wiki': towikidefault = '(default for {}) '.format(progname) description = "Convert SOURCE containing a table CSV format to Mediawikis wikitable syntax in DEST. Do the reverse if --tocsv is given." else: tocsvdefault = '(default for {}) '.format(progname) description = "Convert SOURCE containing a table in Mediawikis wikitable syntax to Excel-readable CSV in DEST. Do the reverse if --towiki is given." # parse arguments parser = argparse.ArgumentParser(description=description) parser.add_argument('-v', '--verbose', action='store_true', help="be more verbose") direction = parser.add_mutually_exclusive_group() direction.add_argument('--tocsv', '-c', action='store_true', help=tocsvdefault+"Convert SOURCE from wikitable format to CSV in DEST") direction.add_argument('--towiki', '-w', action='store_true', help=towikidefault+"Convert SOURCE from CSV format back to wikitable format in DEST") parser.add_argument('source', metavar='SOURCE', type=argparse.FileType('r'), nargs='?', default=sys.stdin, help="The input file to read from. Omit or use '-' to read from stdin") parser.add_argument('dest', metavar='DEST', type=argparse.FileType('w'), nargs='?', default=sys.stdout, help="The file to write output to. Omit or use '-' to write to stdout") args = parser.parse_args() if args.towiki: direction = 'towiki' elif args.tocsv: direction = 'tocsv' elif progname_cooked == 'csv2wiki': direction = 'towiki' else: direction = 'tocsv' if args.verbose: print >>sys.stderr, 'direction=%s\n' % direction, 'source=%s\n' % args.source, 'dest=%s\n' % args.dest, if direction == 'towiki': csv2wiki(args.source, args.dest) else: wiki2csv(args.source, args.dest) if args.verbose: print >>sys.stderr, 'Conversion completed' if __name__ == '__main__': main()