Extract Facebook Public Page information with Python
Extract Facebook Public Page information: the input is a CSV listing the public page URLs you want to scrape, and the output is a CSV file with all the scraped data (Python).
If your input is an xlsx file, use pd.read_excel("name.xlsx", sheet_name="") instead.
You'll need:
- bs4 (to parse the HTML)
- pandas (DataFrames and writing to an Excel/CSV file)
- requests (HTTP client)
ENJOY!
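A minimal input-loading sketch (the file name FbRemains.csv and the Weblink column come from the script below; the xlsx sheet name is yours to fill in):

import pandas as pd

# CSV input (default): one public-page URL per row, in a "Weblink" column
links = pd.read_csv("FbRemains.csv")["Weblink"]

# xlsx input instead (xlrd/openpyxl are both in the environment below):
# links = pd.read_excel("name.xlsx", sheet_name="Sheet1")["Weblink"]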
name: FacebookScrapping
channels:
  - defaults
dependencies:
  - alabaster=0.7.12=py37_0
  - appnope=0.1.0=py37_0
  - asn1crypto=0.24.0=py37_0
  - astroid=2.3.1=py37_0
  - attrs=19.1.0=py37_1
  - babel=2.7.0=py_0
  - backcall=0.1.0=py37_0
  - blas=1.0=mkl
  - bleach=3.1.0=py37_0
  - ca-certificates=2019.8.28=0
  - certifi=2019.9.11=py37_0
  - cffi=1.12.3=py37hb5b8e2f_0
  - chardet=3.0.4=py37_1003
  - cloudpickle=1.2.2=py_0
  - cryptography=2.7=py37ha12b0ac_0
  - dbus=1.13.6=h90a0687_0
  - decorator=4.4.0=py37_1
  - defusedxml=0.6.0=py_0
  - docutils=0.15.2=py37_0
  - entrypoints=0.3=py37_0
  - expat=2.2.6=h0a44026_0
  - gettext=0.19.8.1=h15daf44_3
  - glib=2.56.2=hd9629dc_0
  - icu=58.2=h4b95b61_1
  - idna=2.8=py37_0
  - imagesize=1.1.0=py37_0
  - intel-openmp=2019.4=233
  - ipykernel=5.1.2=py37h39e3cac_0
  - ipython=7.8.0=py37h39e3cac_0
  - ipython_genutils=0.2.0=py37_0
  - isort=4.3.21=py37_0
  - jedi=0.15.1=py37_0
  - jinja2=2.10.1=py37_0
  - jpeg=9b=he5867d9_2
  - jsonschema=3.0.2=py37_0
  - jupyter_client=5.3.3=py37_1
  - jupyter_core=4.5.0=py_0
  - keyring=18.0.0=py37_0
  - lazy-object-proxy=1.4.2=py37h1de35cc_0
  - libcxx=4.0.1=hcfea43d_1
  - libcxxabi=4.0.1=hcfea43d_1
  - libedit=3.1.20181209=hb402a30_0
  - libffi=3.2.1=h475c297_4
  - libgfortran=3.0.1=h93005f0_2
  - libiconv=1.15=hdd342a3_7
  - libpng=1.6.37=ha441bb4_0
  - libsodium=1.0.16=h3efe00b_0
  - markupsafe=1.1.1=py37h1de35cc_0
  - mccabe=0.6.1=py37_1
  - mistune=0.8.4=py37h1de35cc_0
  - mkl=2019.4=233
  - mkl-service=2.3.0=py37hfbe908c_0
  - mkl_fft=1.0.14=py37h5e564d8_0
  - mkl_random=1.1.0=py37ha771720_0
  - nbconvert=5.6.0=py37_1
  - nbformat=4.4.0=py37_0
  - ncurses=6.1=h0a44026_1
  - numpy=1.17.2=py37h99e6662_0
  - numpy-base=1.17.2=py37h6575580_0
  - numpydoc=0.9.1=py_0
  - openssl=1.1.1d=h1de35cc_2
  - packaging=19.2=py_0
  - pandas=0.25.1=py37h0a44026_0
  - pandoc=2.2.3.2=0
  - pandocfilters=1.4.2=py37_1
  - parso=0.5.1=py_0
  - pcre=8.43=h0a44026_0
  - pexpect=4.7.0=py37_0
  - pickleshare=0.7.5=py37_0
  - pip=19.2.3=py37_0
  - prompt_toolkit=2.0.9=py37_0
  - psutil=5.6.3=py37h1de35cc_0
  - ptyprocess=0.6.0=py37_0
  - pycodestyle=2.5.0=py37_0
  - pycparser=2.19=py37_0
  - pyflakes=2.1.1=py37_0
  - pygments=2.4.2=py_0
  - pylint=2.4.2=py37_0
  - pyopenssl=19.0.0=py37_0
  - pyparsing=2.4.2=py_0
  - pyqt=5.9.2=py37h655552a_2
  - pyrsistent=0.15.4=py37h1de35cc_0
  - pysocks=1.7.1=py37_0
  - python=3.7.4=h359304d_1
  - python-dateutil=2.8.0=py37_0
  - python.app=2=py37_9
  - pytz=2019.2=py_0
  - pyzmq=18.1.0=py37h0a44026_0
  - qt=5.9.7=h468cd18_1
  - qtawesome=0.6.0=py_0
  - qtconsole=4.5.5=py_0
  - qtpy=1.9.0=py_0
  - readline=7.0=h1de35cc_5
  - requests=2.22.0=py37_0
  - rope=0.14.0=py_0
  - setuptools=41.2.0=py37_0
  - sip=4.19.8=py37h0a44026_0
  - six=1.12.0=py37_0
  - snowballstemmer=1.9.1=py_0
  - sphinx=2.2.0=py_0
  - sphinxcontrib-applehelp=1.0.1=py_0
  - sphinxcontrib-devhelp=1.0.1=py_0
  - sphinxcontrib-htmlhelp=1.0.2=py_0
  - sphinxcontrib-jsmath=1.0.1=py_0
  - sphinxcontrib-qthelp=1.0.2=py_0
  - sphinxcontrib-serializinghtml=1.1.3=py_0
  - spyder=3.3.6=py37_0
  - spyder-kernels=0.5.2=py37_0
  - sqlite=3.30.0=ha441bb4_0
  - testpath=0.4.2=py37_0
  - tk=8.6.8=ha441bb4_0
  - tornado=6.0.3=py37h1de35cc_0
  - traitlets=4.3.2=py37_0
  - urllib3=1.24.2=py37_0
  - wcwidth=0.1.7=py37_0
  - webencodings=0.5.1=py37_1
  - wheel=0.33.6=py37_0
  - wrapt=1.11.2=py37h1de35cc_0
  - wurlitzer=1.0.3=py37_0
  - xlrd=1.2.0=py37_0
  - xz=5.2.4=h1de35cc_4
  - zeromq=4.3.1=h0a44026_3
  - zlib=1.2.11=h1de35cc_3
  - pip:
    - beautifulsoup4==4.8.1
    - cx-freeze==6.0
    - et-xmlfile==1.0.1
    - google==2.0.2
    - google-search==1.0.2
    - jdcal==1.4.1
    - lxml==4.4.1
    - openpyxl==3.0.0
    - selenium==3.141.0
    - soupsieve==1.9.4
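To recreate this environment before running the script, save the YAML above (e.g. as environment.yml, a name chosen here for illustration) and use conda's standard commands: conda env create -f environment.yml, then conda activate FacebookScrapping (the name declared in the file).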
# -*- coding: utf-8 -*-
# Author : Faycal B
# Last change 06/11/2019
# Requirements:
#   bs4 (to parse the HTML)
#   pandas (DataFrames and writing to an Excel/CSV file)
#   requests (HTTP client)
"""
Spyder Editor

Welcome to the extract Facebook Info script :)
This script takes your Facebook links from an input CSV and outputs a CSV file with all the info.
"""
from bs4 import BeautifulSoup
import requests
import re  # to extract numbers from a string
import pandas as pd

# read the input file
df1 = pd.read_csv("FbRemains.csv")
# keep only the "Weblink" column
df2 = df1["Weblink"]
# create the output DataFrame
df3 = pd.DataFrame(columns=["WebLink", "Last Name", "Company name", "Mail",
                            "Subject", "Followers", "Address", "Website",
                            "ZipCode", "State", "City", "About"])
# k counts the scraped rows
k = 0
for i in df2:
    print(i)
    # Facebook marks each info row with an icon; map each field to its icon URL
    Imgs = {"Follower": "https://static.xx.fbcdn.net/rsrc.php/v3/y7/r/PL1sMLehMAU.png",
            "Subject": "https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png",
            "Phone": "https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png",
            "Website": "https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png",
            "Address": "https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png",
            "Mail": "https://static.xx.fbcdn.net/rsrc.php/v3/yy/r/vKDzW_MdhyP.png"}
    page = i
    html = requests.get(page).text
    # parse the Facebook page
    soup = BeautifulSoup(html, "lxml")
    # the "clearfix _ikh" divs hold all the user info we need
    Clearfix = soup.find_all("div", {"class": "clearfix _ikh"})
    # store the scraped fields in a dictionary
    UserInfo = {}
    # extract the company name
    if soup.find("span", {"class": "_50f7"}) is not None:
        CompanyName = soup.find("span", {"class": "_50f7"}).text
    else:
        CompanyName = "N.A"
    for s in Clearfix:
        for I in Imgs:
            # match each row's icon against the known icon URLs
            icon = s.find("div", {"class": "_4bl7"})
            if icon is not None and Imgs[I] == icon.find("img").attrs["src"]:
                UserInfo[I] = s.find("div", {"class": "_4bl9"}).text
    # skip pages without the follower count we target
    if UserInfo.get("Follower") is None:
        continue
    # prettify the follower count: strip letters and whitespace
    p = UserInfo["Follower"]
    u = re.sub(r"\s+", "", re.sub(r"[a-z]", "", p))
    UserInfo["Follower"] = u
    if UserInfo.get("Address") is not None:
        # prettify the address: split it into street, zip code, city and state
        Address = UserInfo["Address"].split("(")[0]
        ZipCode = UserInfo["Address"].split(")")[1].split(" ")[1]
        City = UserInfo["Address"].split(")")[1].split(" ")[2]
        State = UserInfo["Address"].split(")")[1].split(" ")[3]
        UserInfo["Address"] = Address
        UserInfo["ZipCode"] = ZipCode
        UserInfo["City"] = City
        UserInfo["State"] = State
    else:
        UserInfo["Address"] = "N.A"
        UserInfo["ZipCode"] = "N.A"
        UserInfo["City"] = "N.A"
        UserInfo["State"] = "N.A"
    # go to the About page (assumes the page URL ends with "/")
    html2 = requests.get(page + "about").text
    texte = ""
    soup2 = BeautifulSoup(html2, "lxml")
    Mail = soup2.find_all("div", {"class": "_5aj7 _3-8j"})
    for m in Mail:
        # skip rows that lack the expected icon div
        if m.find("div", {"class": "_4bl7 _3-90 _a8s"}) is not None:
            if Imgs["Mail"] == m.find("div", {"class": "_4bl7 _3-90 _a8s"}).find("img").attrs["src"]:
                UserInfo["Mail"] = m.find("div", {"class": "_4bl9"}).text
    # default missing fields to "N.A" so the row assignment below never fails
    if UserInfo.get("Mail") is None:
        UserInfo["Mail"] = "N.A"
    if UserInfo.get("Website") is None:
        UserInfo["Website"] = "N.A"
    if UserInfo.get("Subject") is None:
        UserInfo["Subject"] = "N.A"
    # extract the About info
    Info = soup2.find_all("div", {"class": "_3-8w"})
    for a in Info:
        texte = texte + a.text
    UserInfo["Info"] = texte
    df3.loc[k] = (i, "Scraping", CompanyName, UserInfo["Mail"], UserInfo["Subject"],
                  UserInfo["Follower"], UserInfo["Address"], UserInfo["Website"],
                  UserInfo["ZipCode"], UserInfo["State"], UserInfo["City"],
                  UserInfo["Info"])
    k += 1
# export everything to a CSV file
df3.to_csv("InputCRM.csv", index=False)
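A minimal usage sketch, assuming the script is saved as extract_facebook_info.py (a hypothetical file name) next to FbRemains.csv. Note that each Weblink should end with a trailing slash, since the script builds the About URL as page + "about":

# run with: python extract_facebook_info.py
import pandas as pd

# inspect the CSV the script wrote
results = pd.read_csv("InputCRM.csv")
print(results[["WebLink", "Company name", "Followers"]].head())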