Last active
March 9, 2016 21:24
-
-
Save lshifr/f619437cbeebc368367e to your computer and use it in GitHub Desktop.
A very simplistic breadth - first HTML parser in Wolfram Mathematica
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| BeginPackage["HTMLParser`"] | |
| ParseHTML::usage = "ParseHTML[s] parses a string of HTML code"; | |
| HTMLContainer::usage = "HTMLContainer[tag] is an inert container for the contents of html tag"; | |
| Begin["`Private`"] | |
| listSplit[x_List,lengthlist_List,headlist_List]:= | |
| MapThread[ | |
| #1@@Take[x,#2]&, | |
| { | |
| headlist, | |
| Transpose[({Most[#1]+1,Rest[#1]}&)[FoldList[Plus,0,lengthlist]]] | |
| } | |
| ]; | |
| reconstructIntervals[listlen_Integer,ints_List]:= | |
| Module[{missed,startint,lastint}, | |
| startint=If[ints[[1,1]]==1,{},{1,ints[[1,1]]-1}]; | |
| lastint=If[ints[[-1,-1]]==listlen,{},{ints[[-1,-1]]+1,listlen}]; | |
| missed= | |
| (If[#1[[2,1]]-#1[[1,2]]>1,{#1[[1,2]]+1,#1[[2,1]]-1},{}]&)/@ | |
| Partition[ints,2,1]; | |
| missed=Join[missed,{lastint}]; | |
| Prepend[Flatten[Transpose[{ints,missed}],1],startint] | |
| ]; | |
| groupElements[lst_List,poslist_List,headlist_List] /; OrderedQ[ | |
| Flatten[Sort[poslist]] | |
| ]&&Length[headlist]==Length[poslist]:= | |
| Module[{totalheadlist,allints,llist}, | |
| totalheadlist= | |
| Append[ | |
| Flatten[ | |
| Transpose[{Array[Sequence&,{Length[headlist]}],headlist}],1 | |
| ], | |
| Sequence | |
| ]; | |
| allints=reconstructIntervals[Length[lst],poslist]; | |
| llist=(If[#1==={},0,1-Subtract@@#1]&)/@allints; | |
| listSplit[lst,llist,totalheadlist] | |
| ]; | |
| groupElements[h_[x__],poslist_List,headlist_List]:= | |
| h[Sequence@@groupElements[{x},poslist,headlist]]; | |
| groupElements[expr_,poslist_List,head_]:= | |
| groupElements[expr,poslist,Table[head,{Length[poslist]}]]; | |
| groupPositions[plist_List]:= | |
| Reap[(Sow[Last[#1],{Most[#1]}]&)/@plist,_,List][[2]]; | |
| processPosList::unmatchedMessageName="Unmatched lists `1` enountered!"; | |
| processPosList[{openlist_List,closelist_List}]:= | |
| Module[{opengroup,closegroup,poslist}, | |
| {opengroup,closegroup}=groupPositions/@{openlist,closelist}; | |
| poslist=Transpose[(Transpose[Sort[#1]]&)/@{opengroup,closegroup}]; | |
| If[ | |
| UnsameQ@@poslist[[1]] | |
| , | |
| Return[ | |
| Message[ | |
| processPosList::unmatchedMessageName,{openlist,closelist} | |
| ]; | |
| {} | |
| ] | |
| , | |
| (*else*) | |
| poslist=Transpose[{poslist[[1,1]],Transpose/@Transpose[poslist[[2]]]}] | |
| ] | |
| ]; | |
| groupElementsNested[nested_,{openposlist_List,closeposlist_List},head_] /; Head[head]=!=List:= | |
| Fold[ | |
| Function[{x,y},MapAt[groupElements[#1,y[[2]],head]&,x,{y[[1]]}]], | |
| nested, | |
| Sort[ | |
| processPosList[{openposlist,closeposlist}], | |
| Length[#2[[1]]]<Length[#1[[1]]]& | |
| ] | |
| ]; | |
| getAllUsedTags[text_String]:= | |
| Module[{htmlTagsposlist,result,chars=Characters[text],x}, | |
| htmlTagsposlist= | |
| StringPosition[ | |
| text,ShortestMatch["<"~~x__~~Whitespace|">"],Overlaps->True | |
| ]; | |
| result= | |
| Union[ | |
| ToLowerCase/@ | |
| Apply[ | |
| StringJoin, | |
| (Take[chars,{#1[[1]],#1[[2]]-1}]&)/@htmlTagsposlist, | |
| {1} | |
| ] /. {"<br/" :> "<br"} | |
| ] | |
| ]; | |
| refineTags[tags_List]:= | |
| Module[{alphabet=Characters["abcdefghijklmnopqrstuvwxyz/"]}, | |
| DeleteCases[tags,x_/;!MemberQ[alphabet,StringTake[x,{2,2}]]] | |
| ]; | |
| getTagTitle[tag_String]:= | |
| If[StringTake[tag,{2,2}]==="/",StringDrop[tag,{2}],tag]; | |
| getPairedTags[tags_List]:= | |
| Reverse/@ | |
| Select[Reap[(Sow[#1,getTagTitle[#1]]&)/@tags,_,#2&][[2]],Length[#1]==2&]; | |
| makeTagReplaceRules[pairedtags_List,unpairedtags_List]:= | |
| Sort[ | |
| Join[ | |
| Apply[Rule,({#1,{StringDrop[#1,1],"Unpaired","Open"}}&)/@unpairedtags,{1}], | |
| Apply[ | |
| Rule, | |
| Flatten[ | |
| (Transpose[{#1,{{StringDrop[#1[[1]],1],"Open"},{StringDrop[#1[[2]],2],"Close"}}}]&)/@ | |
| pairedtags, | |
| 1 | |
| ], | |
| {1} | |
| ], | |
| {">" -> {">","UnpairedClose"}} | |
| ], | |
| StringLength[#1[[2,1]]] > StringLength[#2[[2,1]]]& | |
| ]; | |
| getTagNames[pairedtags_List,unpairedtags_List]:= | |
| (StringDrop[#1,1]&)/@Join[Transpose[pairedtags][[1]],unpairedtags]; | |
| makeTagHashRules[tagnames_List]:= | |
| Dispatch[MapThread[Rule,({#1,Range[Length[#1]]}&)[tagnames]]]; | |
| tagSplit[text_String,{tagrules__Rule}]:= | |
| DeleteCases[ | |
| StringSplit[text,{tagrules}],x_/;StringMatchQ[x,Whitespace|""~~">"~~Whitespace|""] | |
| ]; | |
| splitText[text_String,pairedtags_List,unpairedtags_List]:= | |
| tagSplit[text,makeTagReplaceRules[pairedtags,unpairedtags]]; | |
| preparse[text_]:= | |
| Module[{step1}, | |
| With[{pos=Position[text,{_,"Open"}|{_,_,"Open"},\[Infinity]]}, | |
| step1= | |
| ReplacePart[ | |
| text, | |
| HTMLContainer["attr"]/@Extract[text,pos+1], | |
| pos+1, | |
| List/@Range[Length[pos]] | |
| ] | |
| ] | |
| ]; | |
| openCloseEnumerate[splittext_List,pairedtags_List,unpairedtags_List]:= | |
| Module[{tagnames,taghashrules,tagtitlecounters,unpairedstack={},temptag}, | |
| tagnames=getTagNames[pairedtags,unpairedtags]; | |
| taghashrules=makeTagHashRules[tagnames]; | |
| tagtitlecounters=Table[0,{Length[tagnames]}]; | |
| ( | |
| Switch[ | |
| #1, | |
| {x_,"Open"}, | |
| {#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]}, | |
| {x_,"Close"}, | |
| {#1[[1]],"Close",tagtitlecounters[[#1[[1]]/.taghashrules]]--}, | |
| {x_,"Unpaired","Open"}, | |
| AppendTo[unpairedstack,#1]; | |
| {#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]}, | |
| {"/>"|">","UnpairedClose"}, | |
| If[ | |
| Length[unpairedstack]>0 | |
| , | |
| temptag=unpairedstack[[-1]]; | |
| unpairedstack=Most[unpairedstack]; | |
| {temptag[[1]],"Close",tagtitlecounters[[temptag[[1]]/.taghashrules]]--} | |
| , | |
| (*else*) | |
| #1 | |
| ], | |
| _, | |
| #1 | |
| ]& | |
| )/@ | |
| splittext | |
| ]; | |
| getOpenCloseForm[text_String,pairedtags_List,unpairedtags_List]:= | |
| openCloseEnumerate[ | |
| preparse[splitText[text,pairedtags,unpairedtags]],pairedtags,unpairedtags | |
| ]; | |
| makeTagDepthList[opencloseform_List,pairedtags_List,unpairedtags_List]:= | |
| DeleteCases[ | |
| ({#1,Max[Cases[opencloseform,{#1,"Open"|"Close",x_Integer}:>x]]}&)/@ | |
| getTagNames[pairedtags,unpairedtags], | |
| {x_,-\[Infinity]} | |
| ]; | |
| oneStepParse[parsed_,depth_Integer,tag_String,head_]:= | |
| Module[{plist=(Position[parsed,{tag,#1,depth},\[Infinity]]&)/@{"Open","Close"}}, | |
| groupElementsNested[parsed,plist,head] | |
| ]; | |
| tagProcess[parseme_,{tag_String,maxdepth_Integer}]:= | |
| Module[{hd=HTMLContainer[tag],result}, | |
| With[{hd1=hd,ourtag=tag}, | |
| hd1[{ourtag,"Open",n_},x__,{ourtag,"Close",n_}]:= | |
| hd1[x]; | |
| result=Fold[oneStepParse[#1,#2,ourtag,hd1]&,parseme,Range[maxdepth,1,-1]] | |
| ]; | |
| Clear[hd]; | |
| result | |
| ]; | |
| openCloseProcess[opencloseform_List,pairedtags_List,unpairedtags_List]:= | |
| Fold[tagProcess,opencloseform,makeTagDepthList[opencloseform,pairedtags,unpairedtags]]; | |
| documentParse[text_String,pairedtags_List,unpairedtags_List]:= | |
| openCloseProcess[ | |
| getOpenCloseForm[text,pairedtags,unpairedtags],pairedtags,unpairedtags | |
| ]; | |
| refineParsed[parsed_]:= | |
| (If[#1==={},#1,First[#1]]&)[Cases[parsed,HTMLContainer["html"][___]]]; | |
| parseText[text_String]:= | |
| Module[{tags,paired,unpaired,parsed}, | |
| tags=refineTags[getAllUsedTags[text]]; | |
| paired=getPairedTags[tags]; | |
| unpaired=Complement[tags,Flatten[paired]]; | |
| parsed= | |
| refineParsed[documentParse[text,paired,unpaired]]/.{">","UnpairedClose"}:>">"; | |
| {parsed,paired,unpaired} | |
| ]; | |
| removeLeaves[parsed_]:= | |
| DeleteCases[parsed,_,{-1}]; | |
| postProcess[parsed_]:= | |
| DeleteCases[parsed,">"|"",\[Infinity]]; | |
| Options[ParseHTML] = { | |
| "PostProcess" -> False, | |
| "RemoveLeaves" -> False | |
| }; | |
| ParseHTML[html_String, opts:OptionsPattern[]]:= | |
| With[{ | |
| pp = If[TrueQ["PostProcess"], postProcess, Identity], | |
| rl = If[TrueQ["RemoveLeaves"], removeLeaves, Identity] | |
| }, | |
| pp @ rl @ parseText @ html | |
| ]; | |
| End[] | |
| EndPackage[] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment