Intro to Web Scraping - How to make your own dataset!
SAFE Research Data Center
Errikos Melissinos, Research Assistant - October 2020
Documentation: YouTube video (94 mins)
Requests (https://requests.readthedocs.io) is a library that allows your Python code to interact with websites. We are only going to use the functionality that downloads a web page based on a specific URL.
import requests
The only function that we will use from this library is .get(). As an example, I will use a template from https://www.w3schools.com.
# use the get function
page = requests.get("https://www.w3schools.com/howto/tryhow_make_a_website.htm")
# print the result
# if you want to understand the output better go to:
# https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#2xx_success
page
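If we only need the numeric code, the response object also exposes it directly (status_code and the convenience flag ok are both standard parts of Requests):
# 200 means the request succeeded
page.status_code
# True for all non-error status codes
page.ok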
# see the type of object that we received
type(page)
# to get the data from the page itself we use the .text attribute
page.text
# this object also maintains the source of the page
page.url
'https://www.w3schools.com/howto/tryhow_make_a_website.htm'
The languages that are usually used for building websites are HTML, JavaScript and CSS. Each of these has its own syntax, and all three can be used together when designing a website.
HTML defines the structure of all the elements that the website contains, JavaScript usually handles the dynamic behaviour of websites, and CSS is used for their styling.
In our case, we will be taking advantage of the structure of HTML in order to navigate the pages that we are interested in. It is also possible to use CSS selectors for the same purpose, but here we will not focus on that. So, let's look at the webpage that we downloaded above: https://www.w3schools.com/howto/tryhow_make_a_website.htm
We can also take a look at the source code of the page directly in our browser, for example via the "View Page Source" option.
print(page.text)
<!DOCTYPE html> <html lang="en"> <head> <title>Page Title</title> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1"> <style> * { box-sizing: border-box; } /* Style the body */ body { font-family: Arial, Helvetica, sans-serif; margin: 0; } /* Header/logo Title */ .header { padding: 80px; text-align: center; background: #1abc9c; color: white; } /* Increase the font size of the heading */ .header h1 { font-size: 40px; } /* Style the top navigation bar */ .navbar { overflow: hidden; background-color: #333; } /* Style the navigation bar links */ .navbar a { float: left; display: block; color: white; text-align: center; padding: 14px 20px; text-decoration: none; } /* Right-aligned link */ .navbar a.right { float: right; } /* Change color on hover */ .navbar a:hover { background-color: #ddd; color: black; } /* Column container */ .row { display: -ms-flexbox; /* IE10 */ display: flex; -ms-flex-wrap: wrap; /* IE10 */ flex-wrap: wrap; } /* Create two unequal columns that sits next to each other */ /* Sidebar/left column */ .side { -ms-flex: 30%; /* IE10 */ flex: 30%; background-color: #f1f1f1; padding: 20px; } /* Main column */ .main { -ms-flex: 70%; /* IE10 */ flex: 70%; background-color: white; padding: 20px; } /* Fake image, just for this example */ .fakeimg { background-color: #aaa; width: 100%; padding: 20px; } /* Footer */ .footer { padding: 20px; text-align: center; background: #ddd; } /* Responsive layout - when the screen is less than 700px wide, make the two columns stack on top of each other instead of next to each other */ @media screen and (max-width: 700px) { .row { flex-direction: column; } } /* Responsive layout - when the screen is less than 400px wide, make the navigation links stack on top of each other instead of next to each other */ @media screen and (max-width: 400px) { .navbar a { float: none; width: 100%; } } </style> </head> <body> <div class="header"> <h1>My Website</h1> <p>A website created by me.</p> </div> <div class="navbar"> <a href="#">Link</a> <a href="#">Link</a> <a href="#">Link</a> <a href="#" class="right">Link</a> </div> <div class="row"> <div class="side"> <h2>About Me</h2> <h5>Photo of me:</h5> <div class="fakeimg" style="height:200px;">Image</div> <p>Some text about me in culpa qui officia deserunt mollit anim..</p> <h3>More Text</h3> <p>Lorem ipsum dolor sit ame.</p> <div class="fakeimg" style="height:60px;">Image</div><br> <div class="fakeimg" style="height:60px;">Image</div><br> <div class="fakeimg" style="height:60px;">Image</div> </div> <div class="main"> <h2>TITLE HEADING</h2> <h5>Title description, Dec 7, 2017</h5> <div class="fakeimg" style="height:200px;">Image</div> <p>Some text..</p> <p>Sunt in culpa qui officia deserunt mollit anim id est laborum consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco.</p> <br> <h2>TITLE HEADING</h2> <h5>Title description, Sep 2, 2017</h5> <div class="fakeimg" style="height:200px;">Image</div> <p>Some text..</p> <p>Sunt in culpa qui officia deserunt mollit anim id est laborum consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco.</p> </div> </div> <div class="footer"> <h2>Footer</h2> </div> </body> </html>
I cannot of course go into detail about HTML in this webinar, but it is necessary to have a very basic idea of HTML's structure. The key aspect that we will use is that tagged elements are nested within other tagged elements.
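For example, in the page source above the <h1> and <p> elements are nested within a <div> element:
<div class="header"> <h1>My Website</h1> <p>A website created by me.</p> </div>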
Beautiful Soup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/) is the main library we will use that enables web scraping. Technically, it is a library for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup as bs
So, now we can use BeautifulSoup to get into the website that we downloaded with Requests.
# apply BeautifulSoup as a function
# (we pass "html.parser" explicitly so that Beautiful Soup does not have to guess which parser to use)
soup = bs(page.text, "html.parser")
# a BeautifulSoup object
type(soup)
bs4.BeautifulSoup
This allows us to navigate the webpage. We can access its tags as we would access the attributes of an object.
# body is the tag where the content is usually located.
soup.body
# this is shorthand for the find() function
soup.body == soup.find("body")
# we can go deeper
soup.body.div
# this gives the first tag that it finds with that name
# it does not have to be a child
soup.body.h1 == soup.body.div.h1
# useful terminology: parent, sibling and child
soup.body.div == soup.body.div.h1.parent
soup.body.div.h1.next_sibling # may not be what we expect: often a whitespace text node
soup.body.div.h1.next_sibling.previous_sibling == soup.body.div.h1
soup.body.div.contents
soup.body.div.next_siblings
# next_siblings is a generator; we can iterate over it as we would over a list
# for element in soup.body.div.next_siblings:
#     print(element)
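If we prefer an actual list, we can simply materialise the generator:
# turn the generator into a list
list(soup.body.div.next_siblings)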
# when we need more than just the first match we use find_all()
# and we can go arbitrarily deep with this
soup.body.find_all("div")[2].div.find_all("div")
[<div class="fakeimg" style="height:200px;">Image</div>, <div class="fakeimg" style="height:60px;">Image</div>, <div class="fakeimg" style="height:60px;">Image</div>, <div class="fakeimg" style="height:60px;">Image</div>]
Above we have mostly used the names of the tags in order to navigate the page. However, tags can also have certain attributes, and we can use these as well to find what we need.
# here we use keyword arguments that match tag attributes
soup.find(class_="header")
soup.find_all(charset="UTF-8")
# not everything works: the keyword name= clashes with find()'s own first
# parameter (the tag name), so it cannot match the HTML attribute "name"
print(soup.meta.find(name="viewport"))
None
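The documented way around this is to pass such attributes through the attrs dictionary instead:
# match the HTML attribute "name" explicitly via attrs
soup.find("meta", attrs={"name": "viewport"})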
Now, let's talk a little bit about the different kinds of information that we can access.
# this gives us the text within the tag
soup.title.string
# note that this is not a plain string, even if it usually behaves like one
type(soup.title.string)
# the text attribute or get_text() function gives a string type
soup.title.text
type(soup.title.text)
soup.title.text==soup.title.get_text()
# we can access the name of the tag
soup.title.name
# we can get the values of the attributes
soup.meta.string # this does not return "UTF-8": the charset is an attribute, not text content
# instead, we access the attribute value using dictionary-like syntax
soup.meta["charset"]
'UTF-8'
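Beautiful Soup also collects all attributes of a tag in a dictionary, which is available through the .attrs attribute:
# all attributes of the first <meta> tag as a dictionary
soup.meta.attrs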
The csv module (https://docs.python.org/3/library/csv.html) will help us save our data to a file at the end.
import csv
These libraries (and an idea) are all we need to create our own simple dataset. This particular example is inspired by an initiative in which Prof. Guido Friebel of Goethe University also participates (https://women-economics.com).
Let's start:
# we specify the url that we are interested in
url = "https://ideas.repec.org"
# we use the url to get the page that we are interested in
page = requests.get(url+"/i/ex.html").text
# we apply the Beautiful Soup library to this page
soup = bs(page, "html.parser")
After inspecting the webpage that we are interested in, we can proceed by finding the information on one individual. Once we have figured this out, we can wrap our code in a loop so that we get the information on all the individuals.
In the following cell we will just navigate the page, in order to find the info that we are interested in:
# some trial and error:
soup.body
# print(soup.body.prettify())
soup.body.a
soup.body.td
soup.body.tr
soup.body.table
soup.body.table.a # this looks like what we need
person = soup.body.table.a
# take the name
name = person.string # we get the string
lastName, firstName = name.split(",") # we split first and last name
lastName = lastName.strip() # we get rid of spaces at beginning and end
firstName = firstName.strip()
# take the link for the personal info
link = person["href"]
# take the number of papers
# we could have used regular expressions for this
papers = person.parent.text.split("(")[1]
papers = papers[:papers.find(")")]
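As an aside, the same extraction with a regular expression might look like this (a sketch using Python's built-in re module; it assumes the paper count is the first parenthesised number in the text):
import re
# find the first number in parentheses, e.g. "(15)"
papers = re.search(r"\((\d+)\)", person.parent.text).group(1)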
Check that everything is fine:
(firstName, lastName, link, papers)
('Angels', 'Xabadia', '/e/pxa5.html', '15')
Now we can copy what we did above into a loop that will download the data for all individuals.
# puts all the a tags within the table in a list
people = soup.body.table.find_all("a")
# initialise the list where all the data will go
data = []
for person in people:
    # take the name
    name = person.string # we get the string
    lastName, firstName = name.split(",") # we split first and last name
    lastName = lastName.strip() # we get rid of spaces at beginning and end
    firstName = firstName.strip()
    # take the link for the personal info
    link = person["href"]
    # take the number of papers
    # we could have used regular expressions for this
    papers = person.parent.text.split("(")[1]
    papers = papers[:papers.find(")")]
    # append the data to our list
    data.append((firstName, lastName, link, papers))
We can check that everything worked as we expected:
data[:5]
[('Angels', 'Xabadia', '/e/pxa5.html', '15'), ('Ru', 'Xie', '/e/pxi99.html', '1'), ('Ke-Li', 'Xu', '/e/pxu37.html', '14'), ('Duarte', 'Xara-Brasil', '/e/pxa4.html', '1'), ('Taojun', 'Xie', '/e/pxi120.html', '4')]
Now we will move to something that is a little more involved, as we will be picking up data from several different pages.
Again we start by exploring:
# we get the page for one person
personPage = requests.get(url+data[0][2]).text
# we make our soup for this person
personSoup = bs(personPage, "html.parser")
personSoup
personSoup.body
personSoup.body.ul
personSoup.body.find(id="affiliation")
personSoup.body.find(id="affiliation").h3
personSoup.body.find(id="affiliation").h3.contents
[" Departament d'Economia", <br/>, 'Facultat de Ciències Econòmiques i Empresarials', <br/>, 'Universitat de Girona']
Reminder: be careful how you use these requests. Certain websites may want to block you from web scraping. For others you may want to avoid these methods altogether and use an API that is provided specifically for this purpose.
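If you do loop over many pages, one simple courtesy (not part of the original code) is to pause between consecutive requests; a minimal sketch with the standard time module:
import time
# wait one second before the next request so that we do not flood the server
time.sleep(1)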
# we want to avoid downloading all the available data because
# we do not want to disturb the servers; also, data on 10 individuals
# is enough for our purposes
finalData = [] # this list will store the final data that we want
for personInfo in data[:10]:
    # now for each person we load a different page
    personURL = personInfo[2]
    personPage = requests.get(url + personURL).text
    # we make a soup for each person
    personSoup = bs(personPage, "html.parser")
    # we navigate to the information that we want and we pick
    # what we are interested in
    affiliation = personSoup.find(id="affiliation").h3.contents
    # the following line is not strictly needed, but it keeps only the
    # elements that contain text and drops the <br/> tags (this works for this case)
    affiliation = [element.string for element in affiliation if element.string is not None]
    # append to our list of data
    finalData.append((personInfo[0], personInfo[1], personInfo[3], affiliation[-1]))
Again, we can check that everything worked as we expected:
finalData
[('Angels', 'Xabadia', '15', 'Universitat de Girona'), ('Ru', 'Xie', '1', 'Bangor University'), ('Ke-Li', 'Xu', '14', 'Texas A&M University'), ('Duarte', 'Xara-Brasil', '1', 'Instituto Politécnico de Setúbal'), ('Taojun', 'Xie', '4', 'Singapore Management University'), ('Kuan', 'Xu', '39', 'Dalhousie University'), ('Ana', 'Xavier', '14', 'European Commission'), ('Wei', 'Xie', '6', 'Peking University'), ('Lei', 'Xu', '1', ' Toulouse School of Economics (TSE)'), ('Emanuel', 'Xavier-Oliveira', '6', 'Michigan Technological University')]
Finally, we want to write this data to a file that we can use for our research:
with open("data.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile, delimiter=";")
    writer.writerow(["first name", "last name", "number of papers", "affiliation"])
    for row in finalData:
        writer.writerow(row)
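As a quick sanity check, we can read the file back with the same csv module:
with open("data.csv", newline="") as csvFile:
    reader = csv.reader(csvFile, delimiter=";")
    for row in reader:
        print(row)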
As a bonus, we can also load the table into a pandas (https://pandas.pydata.org) DataFrame:
import pandas as pd
# collect the rows of the table
rows = soup.body.table.find_all("tr")
# initialise data
frameData = []
for row in rows: # go through all the rows
    cells = row.find_all("td") # cells in the row (one for each column)
    # temp = [cell.string for cell in cells] # .string gives None here
    try:
        temp = [cell.a.get_text().strip() for cell in cells]
        frameData.append(temp)
    except AttributeError:
        # skip rows whose cells do not contain an <a> tag
        pass
df = pd.DataFrame(frameData)
# here we just have a list of names, so the pandas DataFrame is not that useful;
# in other cases the different columns of the table may contain different variables.
# You could also extract the column names from the table and pass them to DataFrame(),
# so that you can reference the columns more easily later:
# df = pd.DataFrame(frameData, columns=some_list_of_names)
# for example
df.iloc[1,2]
'Xu, Kuan'
df
  | 0 | 1 | 2
---|---|---|---
0 | Xabadia, Angels | Xie, Ru | Xu, Ke-Li |
1 | Xara-Brasil, Duarte | Xie, Taojun | Xu, Kuan |
2 | Xavier, Ana | Xie, Wei | Xu, Lei |
3 | Xavier-Oliveira, Emanuel | Xie, Xin | Xu, Lihe |
4 | Xefteris, Dimitrios | Xie, Yinxi | Xu, Lin |
5 | Xekalaki, Evdokia | Xie, Yu | Xu, Lin |
6 | Xenidis, Yiannis | Xie, Yuanyuan | Xu, Liujing |
7 | Xenogiani, Theodora | Xie, Zoe | Xu, Lixin Colin |
8 | Xepapadeas, Anastasios | Xifre, Ramon | Xu, Mingxin |
9 | Xesfingi, Sofia | Ximenez-de-Embun, Domingo Perez | Xu, Minya |
10 | Xia, Fan Dora | Xin, Baogui | Xu, Nan |
11 | Xia, Jun | Xin, Guangyi | Xu, Ning |
12 | Xia, Qingjie | Xin, Katherine | Xu, Qiuhua |
13 | Xia, Tian | Xin, Xian | Xu, Rong |
14 | Xia, Weixuan | Xing, Chunbing | Xu, Shaofeng |
15 | Xia, Xiao-Hua | Xing, Victor | Xu, TengTeng |
16 | Xian, Hui | Xing, Weibo | Xu, Tong |
17 | Xiang, Chong | Xing, Yuhang | Xu, Wei |
18 | Xiang, Guocheng | Xing, Yuqing | Xu, Weineng |
19 | Xiang, Jun | Xinshuo, Hou | Xu, Wenli |
20 | Xiang, Shuwen | Xiong, Bo | Xu, Xian |
21 | Xiang, Tao | Xiong, Chenfeng | Xu, Xiaonian |
22 | Xiang, Yi | Xiong, Hang | Xu, Xing |
23 | Xiao, Erte | Xiong, Siyang | Xu, Xinpeng |
24 | Xiao, Jasmine | Xiong, Wei | Xu, Xinzhong |
25 | Xiao, Jing | Xiong, Zequan | Xu, Yahua |
26 | Xiao, Jingliang | Xiouros, Costas | Xu, Yang |
27 | Xiao, Jun | Xiu, Dacheng | Xu, Yi Daniel |
28 | Xiao, Junji | Xu, Bin | Xu, Yilan |
29 | Xiao, Kezhou | Xu, Bing | Xu, Yilong |
30 | Xiao, Qin | Xu, Bing | Xu, Ying |
31 | Xiao, Saizi | Xu, Bing | Xu, Ying |
32 | Xiao, Tim | Xu, Changqing | Xu, Yingfeng |
33 | Xiao, Wei | Xu, Chenggang | Xu, Yizhi |
34 | Xiao, Wei | Xu, Dingbo | Xu, Yongdeng |
35 | Xiao, Yan | Xu, Dinghai | Xu, Yongsheng |
36 | Xiao, Yan Fei | Xu, Elvis Cheng | Xu, Yu |
37 | Xiao, Ying | Xu, Fang | Xu, Yuanwei |
38 | Xiao, Yu | Xu, Fangya | Xu, Zeyu |
39 | Xiao, Zhiguo | Xu, Guo | Xu, Zhengchuan |
40 | Xiao, Zhijie | Xu, Haiqing | Xu, Zhenhui |
41 | Xiaojun, Zhao | Xu, Hangtian | Xu, Zhicheng Phil |
42 | Xiarchos, Irene | Xu, Hao | Xu, Zhiwei |
43 | Xie, Chaoping | Xu, Haofeng | Xu, Zhiying |
44 | Xie, Danxia Daniel | Xu, Heng | Xu, Zhun |
45 | Xie, Danyang | Xu, Jiahua | Xue, Jianpo |
46 | Xie, Erhao | Xu, Jiajun | Xue, Lian |
47 | Xie, Fangzhou | Xu, Jianhuan | Xue, Licun |
48 | Xie, Fei | Xu, Jianwei | Xue, Mei |
49 | Xie, Feixue | Xu, Jiawen | Xue, Melanie Meng |
50 | Xie, Huan | Xu, Juanyi | Xue, Yi |
51 | Xie, Jun | Xu, Jun | Xue, Yuhan |
52 | Xie, Jun | Xu, Junyi | Xue, Yunkui |
53 | Xie, Kuangli | Xu, Kai | Xyngis, Georgios |
54 | Xie, Li | Xu, Ke | None |