masutaka/html2json.rb

## html2json.rb
#!/usr/bin/env ruby

require 'json'
require 'oga'
require 'time'

# Metadata line printed before every document.
# NOTE(review): the action/document line pairs look like the Elasticsearch
# bulk API format -- confirm against whatever consumes this output.
action = {
  'index' => {
    '_index' => 'chalow',
    '_type'  => 'article'
  }
}

# Removes the first node matching +path+ below +node+, if there is one.
# A section that lacks the element is left untouched instead of raising
# NoMethodError on nil.
remove_first = lambda do |node, path|
  match = node.at_xpath(path)
  match.remove if match
end

# Work from the parent of this script's directory so the relative glob resolves.
Dir.chdir(File.expand_path('..', __dir__)) do
  Dir.glob('webroot/chalow/*-*-*-*.html') do |file|
    # The glob also accepts non-numeric "a-b-c-d.html" names; skip those rather
    # than carrying nil captures into Time.parse below.
    next unless /(?<y>\d+)-(?<m>\d+)-(?<d>\d+)-(?<i>\d+)\.html/ =~ file

    id = "#{y}-#{m}-#{d}-#{i}"

    File.open(file) do |f|
      parser = Oga::HTML::Parser.new(f)

      parser.parse.xpath('//div[@class="section"]').each do |article|
        # The title is the first text node of the heading, without "[" characters.
        title_node = article.at_xpath('h3/text()[1]')
        title = title_node ? title_node.text.delete('[').strip : ''

        # Strip everything that should not end up in the body text.
        remove_first.call(article, 'h3')
        remove_first.call(article, 'div[@class="caption"]')
        article.xpath('blockquote[@class="twitter-tweet"]').remove
        remove_first.call(article, 'script[contains(., "socialplus")]')
        remove_first.call(article, 'id("google-adsense")')

        body = article.text.delete("\n").strip

        # The file's trailing index (minus one) is used as the hour of the
        # timestamp; the result is formatted without a zone suffix.
        timestamp = Time.parse("#{y}-#{m}-#{d} #{i.to_i - 1}:00:00 +0900").strftime('%FT%T')

        document = {
          'id'         => id,
          'title'      => title,
          'body'       => body,
          '@timestamp' => timestamp
        }

        # One action line, one document line, then a blank separator line.
        puts action.to_json, document.to_json, ''
      end
    end
  end
end
	#!/usr/bin/env ruby

	require 'json'
	require 'oga'
	require 'time'

	action = {
	'index' => {
	'_index' => 'chalow',
	'_type' => 'article'
	}
	}

	Dir.chdir(File.expand_path('..', __dir__)) do
	Dir.glob('webroot/chalow/*-*-*-*.html') do |file|
	File.open(file) do |f|
	parser = Oga::HTML::Parser.new(f)

	/(?<y>\d+)-(?<m>\d+)-(?<d>\d+)-(?<i>\d+)\.html/ =~ file
	id = "#{y}-#{m}-#{d}-#{i}"

	parser.parse.xpath('//div[@class="section"]').each do |article|
	title = article.at_xpath('h3/text()[1]').text.gsub('[', '').strip

	article.at_xpath('h3').remove
	article.at_xpath('div[@class="caption"]').remove
	article.xpath('blockquote[@class="twitter-tweet"]').tap(&:remove)
	article.at_xpath('script[contains(., "socialplus")]').remove
	article.at_xpath('id("google-adsense")').remove

	body = article.text.gsub("\n", '').strip

	document = {
	'id' => id,
	'title' => title,
	'body' => body,
	'@timestamp' => Time.parse("#{y}-#{m}-#{d} #{i.to_i - 1}:00:00 +0900").strftime('%FT%T'),
	}

	puts action.to_json, document.to_json, ''
	end
	end
	end
	end
No results found