# robinsonkwame/extract.py

## extract.py
#The MIT License
#
#Copyright (c) 2016 Kwame Porter Robinson
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.

from bs4 import BeautifulSoup
import pandas as pd
import json

# Scrape article entries out of January.html and dump them to a
# tab-separated file.
#
# Every <input> whose class contains "text" and whose value is one of the
# categories below marks an entry; the element that follows it holds the
# entry's text (title, fact lines, snippet(s), source) and its links.

# Values of the <input> elements that introduce an article entry.
_CATEGORIES = ('Aggression', 'Bigotry', 'Rhetoric', 'Discrimination', 'Policy')

# Anything shorter is not treated as an article
# (all snippets are at least 101 chars).
_MIN_SNIPPET_CHARS = 101


def _parse_article(lines, link):
    """Turn the text lines of one entry into a flat record.

    There are variant and invariant positions in an entry:
      * invariants: ``lines[0]`` is the title, ``lines[-1]`` is a
        ``label:source`` line;
      * variants: two or three ``label: value`` fact lines follow the title
        (a five-line entry has only two -- typically "where" is missing),
        and every line between the facts and the source is snippet text.

    Args:
        lines: the entry's text lines with the nbsp junk already removed.
        link: href of the entry's last anchor, or None.

    Returns:
        dict with 'title', 'link', 'source', one lower-cased key per fact
        label (values stripped and lower-cased) and 'snippet'.

    Raises:
        ValueError: fewer than five lines, or a fact line without ':'.
        IndexError: the source line has no ':'.
    """
    # Raised explicitly rather than asserted so the check survives ``-O``.
    if len(lines) < 5:
        raise ValueError("Captured content has too few items!")

    record = {
        'title': lines[0],
        'link': link,
        # Split once only: a source that itself contains ':' (a URL, say)
        # must not be cut at its second colon.
        'source': lines[-1].split(':', maxsplit=1)[1],
    }

    fact_count = 2 if len(lines) == 5 else 3
    for fact_line in lines[1:fact_count + 1]:
        label, value = fact_line.split(':', maxsplit=1)
        record[label.lower()] = value.strip().lower()

    # ... and snippets (usually only one)
    record['snippet'] = ' '.join(lines[fact_count + 1:-1])
    return record


json_array = []

# NOTE(review): the file is opened with the platform default encoding while
# the clean-up below looks for '\xA0' -- confirm the encoding of January.html.
with open('January.html') as html:
    soup = BeautifulSoup(html, "html.parser")

for content in soup.select('input[class*="text"]'):
    if content.get('value') not in _CATEGORIES:
        continue

    body = content.next_element
    try:
        snippet = body.get_text()
    except AttributeError:  # nothing after the input that offers get_text()
        continue

    if len(snippet) < _MIN_SNIPPET_CHARS:
        continue

    # get rid of junk unicode &nbsp spaces
    lines = snippet.replace('\n\xA0', '').replace('\xA0', '').splitlines()

    # An entry without any anchor simply gets no link instead of aborting
    # the whole scrape with an IndexError.
    anchors = body.find_all('a')
    link = anchors[-1].get('href') if anchors else None

    # ... store off article content
    json_array.append(_parse_article(lines, link))

output = pd.DataFrame.from_dict(json_array, orient="columns")
output.to_csv('islamophobia_article_scrape.csv', sep='\t')
	#The MIT License
	#
	#Copyright (c) 2016 Kwame Porter Robinson
	#
	#Permission is hereby granted, free of charge, to any person obtaining a copy
	#of this software and associated documentation files (the "Software"), to deal
	#in the Software without restriction, including without limitation the rights
	#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	#copies of the Software, and to permit persons to whom the Software is
	#furnished to do so, subject to the following conditions:
	#
	#The above copyright notice and this permission notice shall be included in
	#all copies or substantial portions of the Software.
	#
	#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	#THE SOFTWARE.

	from bs4 import BeautifulSoup
	import pandas as pd
	import json

	# Scrape article entries out of January.html and dump them to a
	# tab-separated file.
	#
	# Every <input> whose class contains "text" and whose value is one of the
	# categories below marks an entry; the element that follows it holds the
	# entry's text (title, fact lines, snippet(s), source) and its links.

	# Values of the <input> elements that introduce an article entry.
	_CATEGORIES = ('Aggression', 'Bigotry', 'Rhetoric', 'Discrimination', 'Policy')

	# Anything shorter is not treated as an article
	# (all snippets are at least 101 chars).
	_MIN_SNIPPET_CHARS = 101


	def _parse_article(lines, link):
		"""Turn the text lines of one entry into a flat record.

		There are variant and invariant positions in an entry:
		  * invariants: ``lines[0]`` is the title, ``lines[-1]`` is a
		    ``label:source`` line;
		  * variants: two or three ``label: value`` fact lines follow the
		    title (a five-line entry has only two -- typically "where" is
		    missing), and every line between the facts and the source is
		    snippet text.

		Args:
		    lines: the entry's text lines with the nbsp junk already removed.
		    link: href of the entry's last anchor, or None.

		Returns:
		    dict with 'title', 'link', 'source', one lower-cased key per fact
		    label (values stripped and lower-cased) and 'snippet'.

		Raises:
		    ValueError: fewer than five lines, or a fact line without ':'.
		    IndexError: the source line has no ':'.
		"""
		# Raised explicitly rather than asserted so the check survives ``-O``.
		if len(lines) < 5:
			raise ValueError("Captured content has too few items!")

		record = {
			'title': lines[0],
			'link': link,
			# Split once only: a source that itself contains ':' (a URL,
			# say) must not be cut at its second colon.
			'source': lines[-1].split(':', maxsplit=1)[1],
		}

		fact_count = 2 if len(lines) == 5 else 3
		for fact_line in lines[1:fact_count + 1]:
			label, value = fact_line.split(':', maxsplit=1)
			record[label.lower()] = value.strip().lower()

		# ... and snippets (usually only one)
		record['snippet'] = ' '.join(lines[fact_count + 1:-1])
		return record


	json_array = []

	# NOTE(review): the file is opened with the platform default encoding
	# while the clean-up below looks for '\xA0' -- confirm the encoding of
	# January.html.
	with open('January.html') as html:
		soup = BeautifulSoup(html, "html.parser")

	for content in soup.select('input[class*="text"]'):
		if content.get('value') not in _CATEGORIES:
			continue

		body = content.next_element
		try:
			snippet = body.get_text()
		except AttributeError:  # nothing after the input that offers get_text()
			continue

		if len(snippet) < _MIN_SNIPPET_CHARS:
			continue

		# get rid of junk unicode &nbsp spaces
		lines = snippet.replace('\n\xA0', '').replace('\xA0', '').splitlines()

		# An entry without any anchor simply gets no link instead of
		# aborting the whole scrape with an IndexError.
		anchors = body.find_all('a')
		link = anchors[-1].get('href') if anchors else None

		# ... store off article content
		json_array.append(_parse_article(lines, link))

	output = pd.DataFrame.from_dict(json_array, orient="columns")
	output.to_csv('islamophobia_article_scrape.csv', sep='\t')
# No results found