Created
May 20, 2020 02:36
-
-
Save jairopinilla/f514642bad90bbee65f3b4963aada56b to your computer and use it in GitHub Desktop.
UploadGoogleStore.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "UploadGoogleStore.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "machine_shape": "hm", | |
| "mount_file_id": "1dekcKktQvgPTZUnfmGrGJAxKFSQRvERB", | |
| "authorship_tag": "ABX9TyNM3Ah4mpciAmC8XK19+DOz", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/jairopinilla/f514642bad90bbee65f3b4963aada56b/uploadgooglestore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "oVFul6B8wwDc", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from io import BytesIO\n", | |
| "from zipfile import ZipFile\n", | |
| "from urllib.request import urlopen\n", | |
| "import requests\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "import google.cloud.storage\n", | |
| "from google.cloud import storage\n", | |
| "from google.colab import drive\n", | |
| "from zipfile import is_zipfile\n", | |
| "import io" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "8VVD1q1rgzN4", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "First, create a service-account JSON key in Google Cloud:\n", | |
| "https://cloud.google.com/iam/docs/creating-managing-service-account-keys" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "fIDKFlbChJ13", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "**Scraping function**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "MAGv_PuyiPLA", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def getListDonwload(url,baseurl):\n", | |
| " listUrl=[]\n", | |
| " r = requests.get(url)\n", | |
| " html = r.content\n", | |
| " soup = BeautifulSoup(html)\n", | |
| "\n", | |
| " for tr in soup.find_all('a'):\n", | |
| " item = baseurl + tr.text\n", | |
| " listUrl.append(item)\n", | |
| " return listUrl" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "E1hVA594kwVz", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "**Function to upload files to Google Cloud Storage**\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "tcIz4aVyk76X", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def uploadGoogleStorage(subfolder,file_url,bucket):\n", | |
| " url = requests.get(file_url)\n", | |
| " zipbytes = io.BytesIO(url.content)\n", | |
| " \n", | |
| " if is_zipfile(zipbytes):\n", | |
| " with ZipFile(zipbytes, 'r') as myzip:\n", | |
| " for contentfilename in myzip.namelist():\n", | |
| " contentfile = myzip.read(contentfilename)\n", | |
| " blob = bucket.blob(subfolder + contentfilename)\n", | |
| " blob.upload_from_string(contentfile)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "_sI52pjGnHvW", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "**Define the main variables**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "pKiAo_89nOV9", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "KeyJsonStorage='/content/drive/My Drive/proyectomagister/analisisproyect-164f64768235.json'\n", | |
| "BucketName='analisisproyectst001'\n", | |
| "SubfolderBucket='data/' #/data\n", | |
| "ScrapWebPage='http://data.gdeltproject.org/events/index.html'\n", | |
| "BaseUrl='http://data.gdeltproject.org/events/'" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "dQdcREKLoAwi", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "storage_client = storage.Client.from_service_account_json(KeyJsonStorage)\n", | |
| "bucket = storage_client.get_bucket(BucketName)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ZdI2v6DPnl-w", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "fileDownList = getListDonwload(ScrapWebPage,BaseUrl)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "tPPNb3vOo3il", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "for urlfile in fileDownList:\n", | |
| " \n", | |
| " try:\n", | |
| " if(urlfile!='http://data.gdeltproject.org/events/GDELT.MASTERREDUCEDV2.1979-2013.zip'):\n", | |
| " uploadGoogleStorage(SubfolderBucket,urlfile,bucket)\n", | |
| " print('correct:',urlfile)\n", | |
| " \n", | |
| " except:\n", | |
| " print('ERROR------------------------>',urlfile)\n" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Notebook to download a zip file, unzip and upload it to Google Cloud Storage.
References: https://stackoverflow.com/a/54378661