Manuel Albarran

Random notes about Elixir

Naughty parser for HTML based on Regex and Lists

22 Feb 2017 » elixir, parser, html, regex, lists

A mini HTML parser for Elixir based on regexes and lists, with a finder by id.

defmodule FoxyHTML do
  @moduledoc """
  A deliberately small, regex-based HTML tokenizer with a finder.

  `parse/1` turns an HTML string into a flat list of tokens, `find/2` returns
  the tokens of the first element accepted by a predicate (the element itself
  plus everything up to its closing tag), and `to_string/1` turns a list of
  tokens back into markup.

  Every token is a 4-tuple `{type, content, name, id}` where `type` is one of
  `:tag`, `:close_tag`, `:single_tag`, `:no_tag`, `:comment` or `:other`,
  `content` is the matched source text, and `name` / `id` are the tag name and
  the value of its `id` attribute (`nil` when not applicable or absent).

  This is not a conforming HTML parser: attributes are inspected with regular
  expressions, so unusual markup may be tokenized imprecisely.
  """

  @type token :: {atom(), String.t(), String.t() | nil, String.t() | nil}

  # One alternative per token type. The position of the matching capture group
  # is what `build/1` dispatches on, so the order of the groups must not change.
  # The `s` (dotall) flag lets a comment span several lines.
  @re_html ~r{
    (</[a-zA-Z]+[^>]*>)                 # 1: close tag
    |(<[a-zA-Z]+(?:[^/>]|/[^>])*/>)     # 2: self-closed tag
    |(<[a-zA-Z]+[^>]*>)                 # 3: open tag
    |([^<]+)                            # 4: text
    |(<!--.*?-->)                       # 5: comment
    |(.)                                # 6: anything else
  }isx

  @re_tag ~r/<([a-zA-Z]+[0-9]*)/
  # The leading whitespace keeps attributes such as `data-id` from being taken
  # for the element id; attribute names are matched case-insensitively.
  @re_tag_id ~r/\sid\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/i
  @re_closetag ~r{</([a-zA-Z]+[0-9]*)}

  # Elements that never have a closing tag, so they must not be pushed on the
  # stack of open elements even when written without a trailing slash.
  @singles ~w(area base br col embed hr img input link meta param source track wbr)

  @doc """
  Splits `html` into a list of `{type, content, name, id}` tokens.

  Concatenating the `content` of every token gives back the original string.
  """
  @spec parse(String.t()) :: [token]
  def parse(html) do
    @re_html
    |> Regex.scan(html, capture: :all_but_first)
    |> Enum.map(&build/1)
  end

  # `Regex.scan/3` returns one list per match, with empty strings for the
  # alternatives that did not take part; the length of the list therefore
  # identifies which alternative matched.
  defp build([close_tag]) do
    {:close_tag, close_tag, capture_first(close_tag, @re_closetag), nil}
  end

  defp build([_, single_tag]) do
    {:single_tag, single_tag, capture_first(single_tag, @re_tag), capture_id(single_tag)}
  end

  defp build([_, _, tag]) do
    name = capture_first(tag, @re_tag)
    type = if void?(name), do: :single_tag, else: :tag
    {type, tag, name, capture_id(tag)}
  end

  defp build([_, _, _, no_tag]), do: {:no_tag, no_tag, nil, nil}
  defp build([_, _, _, _, comment]), do: {:comment, comment, nil, nil}
  defp build([_, _, _, _, _, other]), do: {:other, other, nil, nil}

  # Tag names are case-insensitive in HTML, so compare in lower case.
  defp void?(name) when is_binary(name), do: String.downcase(name) in @singles
  defp void?(_name), do: false

  # Returns the first capture group of `regex` in `str`, or nil without a match.
  defp capture_first(str, regex) do
    case Regex.run(regex, str, capture: :all_but_first) do
      [first | _] -> first
      _ -> nil
    end
  end

  # Returns the value of the id attribute without its surrounding quotes.
  defp capture_id(tag) do
    case capture_first(tag, @re_tag_id) do
      nil -> nil
      value -> strip_quotes(value)
    end
  end

  defp strip_quotes("\"" <> _ = value), do: String.trim(value, "\"")
  defp strip_quotes("'" <> _ = value), do: String.trim(value, "'")
  defp strip_quotes(value), do: value

  @doc """
  Returns the tokens of the first element for which `fun` returns a truthy value.

  `fun` receives each token in document order until one is accepted. When the
  accepted token is an opening tag, the result runs up to and including its
  closing tag; any other token is returned on its own. An element that is
  never closed ends where one of its ancestors closes, or at the end of the
  input. Returns `nil` when no token is accepted.
  """
  @spec find([token], (token -> as_boolean(term()))) :: [token] | nil
  def find(list, fun) do
    case make_find(list, fun) do
      {:ok, result} -> result
      :error -> nil
    end
  end

  defp make_find(list, fun), do: seek(list, fun, [])

  # Phase 1: walk the tokens until `fun` accepts one, remembering which
  # elements are open at that point (innermost first).
  defp seek([], _fun, _ancestors), do: :error

  defp seek([tuple | rest], fun, ancestors) do
    cond do
      !fun.(tuple) -> seek(rest, fun, track(tuple, ancestors))
      open?(tuple) -> collect(rest, [key(elem(tuple, 2))], ancestors, [tuple])
      true -> {:ok, [tuple]}
    end
  end

  # Phase 2: gather tokens until the accepted element is closed. `inner` holds
  # the accepted element and the descendants still open (innermost first).
  defp collect([], _inner, _ancestors, acc), do: {:ok, Enum.reverse(acc)}

  defp collect([{:close_tag, _, name, _} = tuple | rest], inner, ancestors, acc) do
    tag = key(name)

    cond do
      tag in inner ->
        # Closing a tag also closes every unclosed descendant opened after it.
        case pop_through(inner, tag) do
          [] -> {:ok, Enum.reverse([tuple | acc])}
          remaining -> collect(rest, remaining, ancestors, [tuple | acc])
        end

      tag in ancestors ->
        # An ancestor is closing, so the accepted element ended implicitly;
        # the ancestor's closing tag is not part of it.
        {:ok, Enum.reverse(acc)}

      true ->
        # Stray closing tag with no matching opener: keep it, ignore it.
        collect(rest, inner, ancestors, [tuple | acc])
    end
  end

  defp collect([tuple | rest], inner, ancestors, acc) do
    collect(rest, track(tuple, inner), ancestors, [tuple | acc])
  end

  # Updates the stack of open elements for one token. Single tags are neither
  # pushed nor popped because they have no content.
  defp track({:tag, _, name, _}, stack), do: [key(name) | stack]

  defp track({:close_tag, _, name, _}, stack) do
    tag = key(name)
    # A closing tag that matches nothing open must not disturb the stack.
    if tag in stack, do: pop_through(stack, tag), else: stack
  end

  defp track(_tuple, stack), do: stack

  # Drops entries up to and including the first occurrence of `tag`.
  defp pop_through([tag | tail], tag), do: tail
  defp pop_through([_ | tail], tag), do: pop_through(tail, tag)
  defp pop_through([], _tag), do: []

  # Stack entries are compared in lower case because tag names are
  # case-insensitive.
  defp key(name) when is_binary(name), do: String.downcase(name)
  defp key(name), do: name

  defp open?({:tag, _, _, _}), do: true
  defp open?({_, _, _, _}), do: false

  @doc """
  Joins the `content` of every token in `list` back into a single string.
  """
  @spec to_string([token]) :: String.t()
  def to_string(list) do
    Enum.map_join(list, "", fn {_, content, _, _} -> content end)
  end
end

# Demo: pull out the element whose id is "needle" and print its markup.
html =
  "<div>Lorem<img src=\"image.jpg\"/>ipsum dolor <span id=\"needle\">sit <img src=\"secret.jpg\"> </span> amet</div>"

html
|> FoxyHTML.parse()
|> FoxyHTML.find(fn {_type, _content, _name, id} -> id == "needle" end)
|> FoxyHTML.to_string()
|> IO.inspect()
#> "<span id=\"needle\">sit <img src=\"secret.jpg\"> </span>"