Created
June 27, 2014 11:51
-
-
Save tbicr/cd584138ce183839946f to your computer and use it in GitHub Desktop.
pyparsing example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "", | |
| "signature": "sha256:977a530fe07ef0f070bef944a04bc6a483233573b0bf17ae77e06a0d82e02e52" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Pyparsing - python parsing library\n", | |
| "\n", | |
| "Pavel Tysliatski\n", | |
| "\n", | |
| "Expansa Group" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Common\n", | |
| "\n", | |
| "Site: http://pyparsing.wikispaces.com/\n", | |
| "\n", | |
| "Common information: http://pyparsing.wikispaces.com/HowToUsePyparsing" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from pyparsing import *\n", | |
| "from string import *" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## ParserElement subclasses" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Basic subclasses\n", | |
| "\n", | |
| " Literal\n", | |
| " Word\n", | |
| " Regex\n", | |
| " SkipTo\n", | |
| "\n", | |
| "and so on." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Expression subclasses\n", | |
| "\n", | |
| " And - +\n", | |
| " MatchFirst - |\n", | |
| " Or - ^\n", | |
| " Optional\n", | |
| " ZeroOrMore\n", | |
| " OneOrMore\n", | |
| "\n", | |
| "and so on." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Positional subclasses\n", | |
| "\n", | |
| " StringStart\n", | |
| " StringEnd\n", | |
| " LineStart\n", | |
| " LineEnd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Converter subclasses\n", | |
| "\n", | |
| " Suppress\n", | |
| "\n", | |
| "and so on." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Parser actions\n", | |
| "\n", | |
| " setParseAction\n", | |
| " addParseAction" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Basic example\n", | |
| "Build a parser for strings of the form:\n", | |
| "\n", | |
| " [0-9]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def action(s, loc, toks):\n", | |
| " print('s', type(s), s)\n", | |
| " print('loc', type(loc), loc)\n", | |
| " print('toks', type(toks), toks)\n", | |
| " return ['<'] + list(toks) + ['>']\n", | |
| "\n", | |
| "\n", | |
| "pattern = Suppress('[') + Word(digits) + '-' + Word(digits).setParseAction(action) + Suppress(']')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Parsing methods" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### parseString" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> [0-9]\n", | |
| "loc <class 'int'> 3\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 3, | |
| "text": [ | |
| "(['0', '-', '<', '9', '>'], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# >>> pattern.parseString('test[0-9]test')\n", | |
| "# ParseException: Expected \"[\" (at char 0), (line:1, col:1)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### searchString" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.searchString('test[0-9]test')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> test[0-9]test\n", | |
| "loc <class 'int'> 7\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 5, | |
| "text": [ | |
| "([(['0', '-', '<', '9', '>'], {})], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.searchString('test[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 7\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n", | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 16\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 6, | |
| "text": [ | |
| "([(['0', '-', '<', '9', '>'], {}), (['0', '-', '<', '9', '>'], {})], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### scanString" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "list(pattern.scanString('test[0-9]test'))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> test[0-9]test\n", | |
| "loc <class 'int'> 7\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 7, | |
| "text": [ | |
| "[((['0', '-', '<', '9', '>'], {}), 4, 9)]" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "list(pattern.scanString('test[0-9]test[0-9]'))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 7\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n", | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 16\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 8, | |
| "text": [ | |
| "[((['0', '-', '<', '9', '>'], {}), 4, 9),\n", | |
| " ((['0', '-', '<', '9', '>'], {}), 13, 18)]" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### transformString" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.transformString('test[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 7\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n", | |
| "s <class 'str'> test[0-9]test[0-9]\n", | |
| "loc <class 'int'> 16\n", | |
| "toks <class 'pyparsing.ParseResults'> ['9']\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 9, | |
| "text": [ | |
| "'test0-<9>test0-<9>'" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## More complex example\n", | |
| "\n", | |
| "Build an iterator that expands a template of the form:\n", | |
| "\n", | |
| " text_block[numeric_block][numeric_block]text_block...\n", | |
| " \n", | |
| "For example:\n", | |
| "\n", | |
| " >>> te\\[\\]st\\\\\\\\[0-9]test[0-9]\n", | |
| " te[]st\\\\0test0\n", | |
| " te[]st\\\\0test1\n", | |
| " te[]st\\\\0test2\n", | |
| " ...\n", | |
| " te[]st\\\\9test7\n", | |
| " te[]st\\\\9test8\n", | |
| " te[]st\\\\9test9" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Parser" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "text = SkipTo(StringEnd() | '[')\n", | |
| "numeric = Suppress('[') + Word(digits) + Suppress('-') + Word(digits) + Suppress(']')\n", | |
| "pattern = ZeroOrMore(text | numeric)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def skip_empty(toks):\n", | |
| " if not toks[0]:\n", | |
| " raise ParseException('must be not empty')\n", | |
| " \n", | |
| "\n", | |
| "text.addParseAction(skip_empty)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 11, | |
| "text": [ | |
| "SkipTo:({StringEnd | \"[\"})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 11 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('test[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 12, | |
| "text": [ | |
| "(['test', '0', '9', 'test', '0', '9'], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 13, | |
| "text": [ | |
| "(['te\\\\'], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Escaping" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "escape = (Literal('\\\\\\\\').addParseAction(replaceWith('\\\\')) | \n", | |
| " Literal('\\\\[').addParseAction(replaceWith('[')) |\n", | |
| " Literal('\\\\]').addParseAction(replaceWith(']')))\n", | |
| "text = SkipTo(StringEnd() | '[', ignore=escape).setParseAction(skip_empty)\n", | |
| "pattern = ZeroOrMore(text | numeric)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 15, | |
| "text": [ | |
| "(['te\\\\[\\\\]st\\\\\\\\', '0', '9', 'test', '0', '9'], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def unescape(toks):\n", | |
| " return [escape.transformString(item) for item in toks]\n", | |
| "\n", | |
| "\n", | |
| "text.addParseAction(unescape)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 16, | |
| "text": [ | |
| "SkipTo:({StringEnd | \"[\"})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 17, | |
| "text": [ | |
| "(['te[]st\\\\', '0', '9', 'test', '0', '9'], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Iterator" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def text_block(toks):\n", | |
| " return [iter(toks)]\n", | |
| " \n", | |
| " \n", | |
| "def numeric_block(toks):\n", | |
| " from_value, to_value = toks\n", | |
| " return [map(str, range(int(from_value), int(to_value) + 1))]\n", | |
| "\n", | |
| "\n", | |
| "text.addParseAction(text_block)\n", | |
| "numeric.addParseAction(numeric_block)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 18, | |
| "text": [ | |
| "{Suppress:(\"[\") W:(0123...) Suppress:(\"-\") W:(0123...) Suppress:(\"]\")}" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 19, | |
| "text": [ | |
| "([<list_iterator object at 0x7f35380c7fd0>, <map object at 0x7f35381252b0>, <list_iterator object at 0x7f3538125278>, <map object at 0x7f3538100668>], {})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 19 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from itertools import product\n", | |
| "\n", | |
| "\n", | |
| "def iterator(string):\n", | |
| " return (''.join(items) for items in product(*pattern.parseString(string)))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 20 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Test" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "len(list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]')))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 21, | |
| "text": [ | |
| "100" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]'))[:3]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 22, | |
| "text": [ | |
| "['te[]st\\\\0test0', 'te[]st\\\\0test1', 'te[]st\\\\0test2']" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]'))[-3:]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 23, | |
| "text": [ | |
| "['te[]st\\\\9test7', 'te[]st\\\\9test8', 'te[]st\\\\9test9']" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 23 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Questions?\n", | |
| "\n", | |
| "pavel.tyslyatsky@gmail.com" | |
| ] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment