- Unscheduled: the Deep Web harvest will execute immediately (no delay parameter) and run once (scheduleType="ONCE", no interval parameter)
{
  "id": "string",
  "harvestEventType": "DEEP",
  "scheduleType": "ONCE"
}
| """ | |
| This script returns documents from the BrightPlanet REST API. Input is a text file with a list of queries. | |
| Output is a CSV file with your desired fields for each document. Default time period is everything until present. | |
| Requires 'requests' module. To install via cmd, enter: python -m pip install requests | |
| """ | |
| import requests | |
| import csv |
| import requests | |
| import csv | |
| input_file = r'YOUR_FULL_FILEPATH_HERE' | |
| var_scheduled = "RECURRING" | |
| var_initial_delay = 1.0 # float | |
| var_time_between_scheduled_events = 12.0 # float | |
| var_max_depth = 1 | |
| var_depth_external = 0 | |
| var_max_docsize = -1 |
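
# A minimal sketch of the rest of the flow: read the queries, submit a Deep Web
# harvest, and write the returned documents to CSV. The endpoint URL, payload
# field names, and response shape below are assumptions for illustration;
# check the Harvest API schema for the real contract.
BASE_URL = "https://example.brightplanet.com/rest"  # hypothetical URL

with open(input_file) as f:
    queries = [line.strip() for line in f if line.strip()]

payload = {
    "harvestEventType": "DEEP",
    "scheduleType": var_scheduled,
    "delay": var_initial_delay,                     # assumed unit: hours
    "interval": var_time_between_scheduled_events,  # assumed unit: hours
    "maxDepth": var_max_depth,
    "depthExternal": var_depth_external,
    "maxDocSize": var_max_docsize,                  # -1 = unlimited (assumed)
    "queries": queries,
}
response = requests.post(BASE_URL + "/harvests", json=payload)
response.raise_for_status()

# Hypothetical response layout; adjust to what your deployment returns.
documents = response.json().get("documents", [])
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["title", "url", "harvestDate"])  # example fields
    for doc in documents:
        writer.writerow([doc.get("title"), doc.get("url"), doc.get("harvestDate")])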
#!/usr/bin/env python
import pyperclip
import re

from list_clipboard_manipulations import list_to_clipboard

delete_counter = 0
good_list = list()
sort_alpha = False
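
# A minimal sketch of the filtering pass these variables set up. The pattern
# below is a placeholder; swap in the regex for the lines you want to drop.
drop_pattern = re.compile(r'^\s*$')  # placeholder: drop blank lines

for line in pyperclip.paste().splitlines():
    if drop_pattern.search(line):
        delete_counter += 1
    else:
        good_list.append(line)

if sort_alpha:
    good_list.sort()

list_to_clipboard(good_list)
print("Deleted {} line(s); kept {}.".format(delete_counter, len(good_list)))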
#!/usr/bin/env python
import requests

infile = r'C:\Users\Account\PythonFiles\generic_infile.txt'  # full path to any file inside quotes

# Harvest Event Variables
api_key = "123abc"                  # STRING - 1 API key per Harvest API schema
searchable_items_per_event = 100    # INT - max queries OR max screenNames
name_of_event = "NewYork_Politics"  # STRING - the program will prepend "TW_" and append "_#"
filterQuery = None                  # STRING - ex: "nuclear AND (war OR energy)"
event_tags = ["source_Politics", "New York"]  # LIST
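
# A minimal sketch of how these variables might drive event creation: read the
# input file, split it into chunks of searchable_items_per_event, and name each
# event "TW_<name_of_event>_<n>". The endpoint URL, auth mechanism, and payload
# field names are assumptions; check the Harvest API schema for the real ones.
with open(infile) as f:
    items = [line.strip() for line in f if line.strip()]

chunks = [items[i:i + searchable_items_per_event]
          for i in range(0, len(items), searchable_items_per_event)]

for n, chunk in enumerate(chunks, start=1):
    payload = {
        "name": "TW_{}_{}".format(name_of_event, n),
        "filterQuery": filterQuery,
        "tags": event_tags,
        "searchableItems": chunk,  # assumed field name
    }
    requests.post("https://example.brightplanet.com/rest/harvests",  # hypothetical URL
                  params={"apikey": api_key},  # assumed auth mechanism
                  json=payload).raise_for_status()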
EXAMPLE JSON PAYLOADS FOR BRIGHTPLANET HARVEST API
=================================================
1. Website harvest - scraping search results pages
2. Website harvest - harvesting a list of URLs, includes XPath overwrite and date-finding XPath
3. Website harvest - scheduled harvest to monitor new documents
4. Deep Web harvest - query search engines (USE SPARINGLY - rate limits)
5. Deep Web harvest - query sources from multiple source groups
6. RSS harvest - monitor new documents daily using RSS feeds, includes XPath overwrite and date-finding XPath
7. XPath expressions - use these XPaths to manipulate which text is harvested from a web page
=================================================
#!/usr/bin/env python
import pyperclip

example_list = ["Line 1", "Line 2", "Line 3", "forever and ever"]


def list_to_clipboard(output_list):
    """Check if len(output_list) > 0, then copy the list to the clipboard, one item per line."""
    if len(output_list) > 0:
        pyperclip.copy('\n'.join(output_list))
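
# Example usage: places the items on the clipboard joined by newlines.
list_to_clipboard(example_list)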
#!/usr/bin/env python
import re

input_file = 'infile.txt'    # enter the full file path; precede the string with 'r' (r'PATH') if using Windows
output_file = 'outfile.txt'  # enter the full file path; precede the string with 'r' (r'PATH') if using Windows
delete_counter = 0

# List of individual regexes, which will be combined into a single regex in the next step
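# A minimal sketch of that next step: join the individual patterns with '|',
# compile once, and drop matching lines. The patterns are placeholders.
regex_list = [
    r'^\s*$',  # placeholder: blank lines
    r'^#',     # placeholder: comment lines
]
combined_regex = re.compile('|'.join(regex_list))

with open(input_file) as fin, open(output_file, 'w') as fout:
    for line in fin:
        if combined_regex.search(line):
            delete_counter += 1
        else:
            fout.write(line)

print("Deleted {} line(s).".format(delete_counter))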