Extract Facebook Public Page information with Python
Extract Facebook Public Page information: the input is a CSV listing the public page URLs you want to scrape, and the output is a CSV file with all the scraped data (Python).
If your input is an xlsx file, use pd.read_excel("name.xlsx", sheet_name="") instead.
You'll need:
- bs4 (to parse the HTML)
- pandas (DataFrames and writing to an Excel/CSV file)
- requests (HTTP client)
ENJOY!
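A minimal input-loading sketch (the file name FbRemains.csv and the Weblink column come from the script below; the xlsx sheet name is yours to fill in):

import pandas as pd

# CSV input (default): one public-page URL per row, in a "Weblink" column
links = pd.read_csv("FbRemains.csv")["Weblink"]

# xlsx input instead (xlrd/openpyxl are both in the environment below):
# links = pd.read_excel("name.xlsx", sheet_name="Sheet1")["Weblink"]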
name: FacebookScrapping
channels:
  - defaults
dependencies:
  - alabaster=0.7.12=py37_0
  - appnope=0.1.0=py37_0
  - asn1crypto=0.24.0=py37_0
  - astroid=2.3.1=py37_0
  - attrs=19.1.0=py37_1
  - babel=2.7.0=py_0
  - backcall=0.1.0=py37_0
  - blas=1.0=mkl
  - bleach=3.1.0=py37_0
  - ca-certificates=2019.8.28=0
  - certifi=2019.9.11=py37_0
  - cffi=1.12.3=py37hb5b8e2f_0
  - chardet=3.0.4=py37_1003
  - cloudpickle=1.2.2=py_0
  - cryptography=2.7=py37ha12b0ac_0
  - dbus=1.13.6=h90a0687_0
  - decorator=4.4.0=py37_1
  - defusedxml=0.6.0=py_0
  - docutils=0.15.2=py37_0
  - entrypoints=0.3=py37_0
  - expat=2.2.6=h0a44026_0
  - gettext=0.19.8.1=h15daf44_3
  - glib=2.56.2=hd9629dc_0
  - icu=58.2=h4b95b61_1
  - idna=2.8=py37_0
  - imagesize=1.1.0=py37_0
  - intel-openmp=2019.4=233
  - ipykernel=5.1.2=py37h39e3cac_0
  - ipython=7.8.0=py37h39e3cac_0
  - ipython_genutils=0.2.0=py37_0
  - isort=4.3.21=py37_0
  - jedi=0.15.1=py37_0
  - jinja2=2.10.1=py37_0
  - jpeg=9b=he5867d9_2
  - jsonschema=3.0.2=py37_0
  - jupyter_client=5.3.3=py37_1
  - jupyter_core=4.5.0=py_0
  - keyring=18.0.0=py37_0
  - lazy-object-proxy=1.4.2=py37h1de35cc_0
  - libcxx=4.0.1=hcfea43d_1
  - libcxxabi=4.0.1=hcfea43d_1
  - libedit=3.1.20181209=hb402a30_0
  - libffi=3.2.1=h475c297_4
  - libgfortran=3.0.1=h93005f0_2
  - libiconv=1.15=hdd342a3_7
  - libpng=1.6.37=ha441bb4_0
  - libsodium=1.0.16=h3efe00b_0
  - markupsafe=1.1.1=py37h1de35cc_0
  - mccabe=0.6.1=py37_1
  - mistune=0.8.4=py37h1de35cc_0
  - mkl=2019.4=233
  - mkl-service=2.3.0=py37hfbe908c_0
  - mkl_fft=1.0.14=py37h5e564d8_0
  - mkl_random=1.1.0=py37ha771720_0
  - nbconvert=5.6.0=py37_1
  - nbformat=4.4.0=py37_0
  - ncurses=6.1=h0a44026_1
  - numpy=1.17.2=py37h99e6662_0
  - numpy-base=1.17.2=py37h6575580_0
  - numpydoc=0.9.1=py_0
  - openssl=1.1.1d=h1de35cc_2
  - packaging=19.2=py_0
  - pandas=0.25.1=py37h0a44026_0
  - pandoc=2.2.3.2=0
  - pandocfilters=1.4.2=py37_1
  - parso=0.5.1=py_0
  - pcre=8.43=h0a44026_0
  - pexpect=4.7.0=py37_0
  - pickleshare=0.7.5=py37_0
  - pip=19.2.3=py37_0
  - prompt_toolkit=2.0.9=py37_0
  - psutil=5.6.3=py37h1de35cc_0
  - ptyprocess=0.6.0=py37_0
  - pycodestyle=2.5.0=py37_0
  - pycparser=2.19=py37_0
  - pyflakes=2.1.1=py37_0
  - pygments=2.4.2=py_0
  - pylint=2.4.2=py37_0
  - pyopenssl=19.0.0=py37_0
  - pyparsing=2.4.2=py_0
  - pyqt=5.9.2=py37h655552a_2
  - pyrsistent=0.15.4=py37h1de35cc_0
  - pysocks=1.7.1=py37_0
  - python=3.7.4=h359304d_1
  - python-dateutil=2.8.0=py37_0
  - python.app=2=py37_9
  - pytz=2019.2=py_0
  - pyzmq=18.1.0=py37h0a44026_0
  - qt=5.9.7=h468cd18_1
  - qtawesome=0.6.0=py_0
  - qtconsole=4.5.5=py_0
  - qtpy=1.9.0=py_0
  - readline=7.0=h1de35cc_5
  - requests=2.22.0=py37_0
  - rope=0.14.0=py_0
  - setuptools=41.2.0=py37_0
  - sip=4.19.8=py37h0a44026_0
  - six=1.12.0=py37_0
  - snowballstemmer=1.9.1=py_0
  - sphinx=2.2.0=py_0
  - sphinxcontrib-applehelp=1.0.1=py_0
  - sphinxcontrib-devhelp=1.0.1=py_0
  - sphinxcontrib-htmlhelp=1.0.2=py_0
  - sphinxcontrib-jsmath=1.0.1=py_0
  - sphinxcontrib-qthelp=1.0.2=py_0
  - sphinxcontrib-serializinghtml=1.1.3=py_0
  - spyder=3.3.6=py37_0
  - spyder-kernels=0.5.2=py37_0
  - sqlite=3.30.0=ha441bb4_0
  - testpath=0.4.2=py37_0
  - tk=8.6.8=ha441bb4_0
  - tornado=6.0.3=py37h1de35cc_0
  - traitlets=4.3.2=py37_0
  - urllib3=1.24.2=py37_0
  - wcwidth=0.1.7=py37_0
  - webencodings=0.5.1=py37_1
  - wheel=0.33.6=py37_0
  - wrapt=1.11.2=py37h1de35cc_0
  - wurlitzer=1.0.3=py37_0
  - xlrd=1.2.0=py37_0
  - xz=5.2.4=h1de35cc_4
  - zeromq=4.3.1=h0a44026_3
  - zlib=1.2.11=h1de35cc_3
  - pip:
    - beautifulsoup4==4.8.1
    - cx-freeze==6.0
    - et-xmlfile==1.0.1
    - google==2.0.2
    - google-search==1.0.2
    - jdcal==1.4.1
    - lxml==4.4.1
    - openpyxl==3.0.0
    - selenium==3.141.0
    - soupsieve==1.9.4
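To recreate this environment before running the script, save the YAML above (e.g. as environment.yml, a name chosen here for illustration) and use conda's standard commands: conda env create -f environment.yml, then conda activate FacebookScrapping (the name declared in the file).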
# -*- coding: utf-8 -*-
# Author : Faycal B
# Last change 06/11/2019
# Requirements:
#   bs4 (to parse the HTML)
#   pandas (DataFrames and writing to an Excel/CSV file)
#   requests (HTTP client)
"""
Spyder Editor

Welcome to the extract Facebook Info script :)
This script takes your Facebook links from an input CSV and outputs a CSV file with all the info.
"""
from bs4 import BeautifulSoup
import requests
import re  # to extract numbers from a string
import pandas as pd

# read the input file
df1 = pd.read_csv("FbRemains.csv")
# keep only the "Weblink" column
df2 = df1["Weblink"]
# create the output DataFrame
df3 = pd.DataFrame(columns=["WebLink", "Last Name", "Company name", "Mail",
                            "Subject", "Followers", "Address", "Website",
                            "ZipCode", "State", "City", "About"])
# k counts the scraped rows
k = 0
for i in df2:
    print(i)
    # Facebook marks each info row with an icon; map each field to its icon URL
    Imgs = {"Follower": "https://static.xx.fbcdn.net/rsrc.php/v3/y7/r/PL1sMLehMAU.png",
            "Subject": "https://static.xx.fbcdn.net/rsrc.php/v3/yl/r/LwDWwC1d0Rx.png",
            "Phone": "https://static.xx.fbcdn.net/rsrc.php/v3/yW/r/mYv88EsODOI.png",
            "Website": "https://static.xx.fbcdn.net/rsrc.php/v3/yx/r/xVA3lB-GVep.png",
            "Address": "https://static.xx.fbcdn.net/rsrc.php/v3/y5/r/vfXKA62x4Da.png",
            "Mail": "https://static.xx.fbcdn.net/rsrc.php/v3/yy/r/vKDzW_MdhyP.png"}
    page = i
    html = requests.get(page).text
    # parse the Facebook page
    soup = BeautifulSoup(html, "lxml")
    # the "clearfix _ikh" divs hold all the user info we need
    Clearfix = soup.find_all("div", {"class": "clearfix _ikh"})
    # store the scraped fields in a dictionary
    UserInfo = {}
    # extract the company name
    if soup.find("span", {"class": "_50f7"}) is not None:
        CompanyName = soup.find("span", {"class": "_50f7"}).text
    else:
        CompanyName = "N.A"
    for s in Clearfix:
        for I in Imgs:
            # match each row's icon against the known icon URLs
            icon = s.find("div", {"class": "_4bl7"})
            if icon is not None and Imgs[I] == icon.find("img").attrs["src"]:
                UserInfo[I] = s.find("div", {"class": "_4bl9"}).text
    # skip pages without the follower count we target
    if UserInfo.get("Follower") is None:
        continue
    # prettify the follower count: strip letters and whitespace
    p = UserInfo["Follower"]
    u = re.sub(r"\s+", "", re.sub(r"[a-z]", "", p))
    UserInfo["Follower"] = u
    if UserInfo.get("Address") is not None:
        # prettify the address: split it into street, zip code, city and state
        Address = UserInfo["Address"].split("(")[0]
        ZipCode = UserInfo["Address"].split(")")[1].split(" ")[1]
        City = UserInfo["Address"].split(")")[1].split(" ")[2]
        State = UserInfo["Address"].split(")")[1].split(" ")[3]
        UserInfo["Address"] = Address
        UserInfo["ZipCode"] = ZipCode
        UserInfo["City"] = City
        UserInfo["State"] = State
    else:
        UserInfo["Address"] = "N.A"
        UserInfo["ZipCode"] = "N.A"
        UserInfo["City"] = "N.A"
        UserInfo["State"] = "N.A"
    # go to the About page (assumes the page URL ends with "/")
    html2 = requests.get(page + "about").text
    texte = ""
    soup2 = BeautifulSoup(html2, "lxml")
    Mail = soup2.find_all("div", {"class": "_5aj7 _3-8j"})
    for m in Mail:
        # skip rows that lack the expected icon div
        if m.find("div", {"class": "_4bl7 _3-90 _a8s"}) is not None:
            if Imgs["Mail"] == m.find("div", {"class": "_4bl7 _3-90 _a8s"}).find("img").attrs["src"]:
                UserInfo["Mail"] = m.find("div", {"class": "_4bl9"}).text
    # default missing fields to "N.A" so the row assignment below never fails
    if UserInfo.get("Mail") is None:
        UserInfo["Mail"] = "N.A"
    if UserInfo.get("Website") is None:
        UserInfo["Website"] = "N.A"
    if UserInfo.get("Subject") is None:
        UserInfo["Subject"] = "N.A"
    # extract the About info
    Info = soup2.find_all("div", {"class": "_3-8w"})
    for a in Info:
        texte = texte + a.text
    UserInfo["Info"] = texte
    df3.loc[k] = (i, "Scraping", CompanyName, UserInfo["Mail"], UserInfo["Subject"],
                  UserInfo["Follower"], UserInfo["Address"], UserInfo["Website"],
                  UserInfo["ZipCode"], UserInfo["State"], UserInfo["City"],
                  UserInfo["Info"])
    k += 1
# export everything to a CSV file
df3.to_csv("InputCRM.csv", index=False)
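A minimal usage sketch, assuming the script is saved as extract_facebook_info.py (a hypothetical file name) next to FbRemains.csv. Note that each Weblink should end with a trailing slash, since the script builds the About URL as page + "about":

# run with: python extract_facebook_info.py
import pandas as pd

# inspect the CSV the script wrote
results = pd.read_csv("InputCRM.csv")
print(results[["WebLink", "Company name", "Followers"]].head())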