Based on DataCamp.
Write function
Functions that Return Single Values
# Define shout with the parameter, word
def shout(word):
    """Append three exclamation marks to word and return the new string."""
    # Return (rather than print) so the caller can store or reuse the value
    return word + '!!!'
# If the function prints the result instead of using return, the value it returns is None (NoneType)
# Pass 'congratulations' to shout: yell
yell = shout('congratulations')
# Print yell
print(yell)
> congratulations!!!
Multiple Parameters and Return Values
# A brief introduction to tuples(immutable!!)
nums = (3, 4, 6)
## Unpack nums into num1, num2, and num3
num1, num2, num3 = nums
## Construct even_nums
even_nums=(2,4,6)
# Define shout_all with parameters word1 and word2
def shout_all(word1, word2):
    """Return a 2-tuple holding word1 and word2, each followed by '!!!'."""
    suffix = '!!!'
    # Packing both results into one tuple lets the caller unpack them
    return (word1 + suffix, word2 + suffix)
# Pass 'congratulations' and 'you' to shout_all(): yell1, yell2
yell1,yell2=shout_all('congratulations','you')
# Print yell1 and yell2
print(yell1)
print(yell2)
> congratulations!!!
> you!!!
Twitter Dataframe Analysis
The dataset contains Twitter data and you will iterate over entries in a column to build a dictionary in which the keys are the names of languages and the values are the number of tweets in the given language.
# Import pandas
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv('tweets.csv')
# Define count_entries()
def count_entries(df, col_name):
    """Return a dictionary with counts of
    occurrences as value for each key.

    df is indexed as df[col_name] and the result is iterated, so the
    entries of that column must be hashable.
    """
    # Initialize an empty dictionary: langs_count
    langs_count = {}
    # Extract column from DataFrame: col
    col = df[col_name]
    for entry in col:
        # dict.get() supplies 0 for an entry not seen yet, so one statement
        # covers both the "first time" and the "seen before" case
        langs_count[entry] = langs_count.get(entry, 0) + 1
    # Return the langs_count dictionary
    return langs_count
# Call count_entries() on df, the DataFrame read from 'tweets.csv': result
result = count_entries(df, 'lang')
# Print the result
print(result)
> {'en': 97, 'et': 1, 'und': 2}
Scope
- LEGB rule: L for local, E for enclosing functions, G for global, and B for built-in.
# Assigning to a name only creates or changes a local name
num = 5
def func1():
    # Assigning to num here binds a new local name; the module-level num
    # keeps its value
    num = 3
    print(num)
func1()
> 3
print(num)
> 5
def func2():
    """Print twice the global num, then rebind the global num to 6."""
    # The global declaration makes both the read and the assignment below
    # refer to the module-level name
    global num
    print(num * 2)
    num = 6
func2()
> 10
print(num)
> 6
# Create a string: team
team = "teen titans"
# Define change_team()
def change_team():
    """Change the value of the global variable team."""
    # Declare team as global so the assignment below rebinds the
    # module-level name instead of creating a local one
    global team
    # Change the value of team in global: team
    team = "justice league"
# Print team
print(team)
> teen titans
# Call change_team()
change_team()
# Print team
print(team)
> justice league
Nested functions
# Define three_shouts
def three_shouts(word1, word2, word3):
    """Return a 3-tuple of the given words, each with '!!!' appended."""

    def inner(word):
        """Return word followed by '!!!'."""
        return word + '!!!'

    # Run the nested helper over the arguments in order and pack a tuple
    return tuple(inner(item) for item in (word1, word2, word3))
# Call three_shouts() and print
print(three_shouts('a', 'b', 'c'))
> ('a!!!', 'b!!!', 'c!!!')
# Define echo
def echo(n):
    """Build and return a function that repeats its argument n times."""

    def inner_echo(word1):
        """Return word1 repeated n times; n comes from the enclosing call."""
        return word1 * n

    return inner_echo
# Call echo: twice
twice = echo(2)
# Call echo: thrice
thrice = echo(3)
# Call twice() and thrice() then print
print(twice('hello'), thrice('hello'))
> hellohello hellohellohello
# Define echo_shout()
def echo_shout(word):
    """Print word doubled, then print it again after a nested function adds '!!!'."""
    echo_word = word * 2
    print(echo_word)

    def shout():
        """Append '!!!' to echo_word in the enclosing scope."""
        # nonlocal makes the assignment rebind echo_shout's variable
        # instead of creating a new local inside shout()
        nonlocal echo_word
        echo_word = echo_word + '!!!'

    shout()
    # The second print shows the change made by shout()
    print(echo_word)
# Call function echo_shout() with argument 'hello'
echo_shout('hello')
>> hellohello
hellohello!!!
Default and Flexible Arguments
Add a default argument
# Define power
def power(number, pow=1):
    """Return number raised to the exponent pow (pow defaults to 1)."""
    return number ** pow
power(9, 2)
> 81
power(9, 1)
> 9
power(9)
> 9
Functions with variable-length arguments (*args)
# args is a tuple!!
# *args means the function accepts a variable-length list of non-keyword (positional) arguments as its input
def add_all(*args):
    """Return the total of every positional argument (0 when none are given)."""
    # The built-in sum() starts from 0 and adds each value in order
    return sum(args)
add_all(1)
> 1
add_all(1, 2)
> 3
add_all(5, 10, 15, 20)
> 50
Functions with variable-length keyword arguments (**kwargs)
# kwargs is a dictionary!!
# **kwargs means the function accepts a variable-length dictionary of keyword arguments as its input
def print_all(**kwargs):
    """Print out key-value pairs in **kwargs.

    Each pair is printed on its own line as 'key: value'.
    """
    for key, value in kwargs.items():
        # An f-string formats any value type; plain concatenation would
        # raise TypeError for non-string values
        print(f"{key}: {value}")
print_all(name="dumbledore",
job="headmaster")
> job: headmaster
name: dumbledore
Lambda Functions
- Map() and lambda functions
# Function map takes two arguments: map(func, seq)
# map() applies the function to ALL elements in the sequence
nums = [48, 6, 9, 21, 1]
square_all = map(lambda num: num ** 2, nums)
print(square_all)
> <map object at 0x103e065c0>
print(list(square_all))
> [2304, 36, 81, 441, 1]
- Filter() and lambda functions
# filter() offers a way to filter out elements from a list that don't satisfy certain criteria.
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'pippin', 'aragorn', 'boromir', 'legolas', 'gimli', 'gandalf']
# Use filter() to apply a lambda function over fellowship: result
result = filter(lambda member:len(member)>6 , fellowship)
# Convert result to a list: result_list
result_list=list(result)
# Print result_list
print(result_list)
> ['samwise', 'aragorn', 'boromir', 'legolas', 'gandalf']
- Reduce() and lambda functions
# Import reduce from functools
from functools import reduce
# Create a list of strings: stark
stark = ['robb', 'sansa', 'arya', 'brandon', 'rickon']
# Use reduce() to apply a lambda function over stark: result
result = reduce(lambda item1,item2: item1+item2, stark)
# Print the result
print(result)
> robbsansaaryabrandonrickon
Error Handling
try-except
def sqrt(x):
    """Returns the square root of a number.

    Prints a message and returns None when x does not support being
    raised to a float power (for example a string).
    """
    try:
        return x ** 0.5
    # Catch only TypeError: a bare except would also hide unrelated errors
    # and even KeyboardInterrupt
    except TypeError:
        print('x must be an int or float')
sqrt(10.0)
> 3.1622776601683795
sqrt('hi')
> x must be an int or float
raising an error
def sqrt(x):
    """Returns the square root of a number.

    Raises ValueError when x is negative. Prints a message and returns
    None when x is not a number.
    """
    try:
        # The comparison is inside the try block because x < 0 itself
        # raises TypeError for non-numeric input such as a string
        if x < 0:
            raise ValueError('x must be non-negative')
        return x ** 0.5
    except TypeError:
        print('x must be an int or float')
Iterators
Iterators vs Iterables
An iterable is an object that can return an iterator, while an iterator is an object that keeps state and produces the next value when you call next()
- Iterable
- Examples: lists, strings, dictionaries, file connections
- An object with an associated iter() method
- Applying iter() to an iterable creates an iterator
- Iterator
- Produces next value with next()
# Iterating over iterables: next()
word = 'Da'
it = iter(word)
next(it)
> 'D'
next(it)
> 'a'
# Iterating at once with *
word = 'Data'
it = iter(word)
print(*it)
> D a t a
# Iterating over dictionaries
pythonistas = {'hugo': 'bowne-anderson'
,
'francis': 'castro'}
for key, value in pythonistas.items():
print(key, value)
> francis castro
hugo bowne-anderson
# Iterating over file connections
file = open('file.txt')
it = iter(file)
print(next(it))
> This is the first line.
print(next(it))
> This is the second line.
enumerate()
enumerate() returns an enumerate object that produces a sequence of tuples, and each of the tuples is an index-value pair.
# Using enumerate()
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
e = enumerate(avengers)
print(type(e))
> <class 'enumerate'>
e_list = list(e)
print(e_list)
> [(0, 'hawkeye'), (1, 'iron man'), (2, 'thor'), (3, 'quicksilver')]
# enumerate() and unpack
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
for index, value in enumerate(avengers):
print(index, value)
> 0 hawkeye
1 iron man
2 thor
3 quicksilver
for index, value in enumerate(avengers, start=10):
print(index, value)
> 10 hawkeye
11 iron man
12 thor
13 quicksilver
zip()
# Using zip()
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
names = ['barton', 'stark', 'odinson', 'maximoff']
z = zip(avengers, names)
print(type(z))
> <class 'zip'>
z_list = list(z)
print(z_list)
> [('hawkeye', 'barton'), ('iron man', 'stark'),
('thor', 'odinson'), ('quicksilver', 'maximoff')]
# zip() and unpack
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
names = ['barton', 'stark', 'odinson', 'maximoff']
for z1, z2 in zip(avengers, names):
print(z1, z2)
> hawkeye barton
iron man stark
thor odinson
quicksilver maximoff
# Print zip with *
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
names = ['barton', 'stark', 'odinson', 'maximoff']
z = zip(avengers, names)
print(*z)
> ('hawkeye', 'barton') ('iron man', 'stark') ('thor', 'odinson') ('quicksilver', 'maximoff')
Extracting information for large amounts of Twitter data
# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of
    occurrences as value for each key.

    csv_file is read with pd.read_csv in chunks of c_size rows, and the
    entries of column colname are counted across all chunks.
    """
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}
    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize=c_size):
        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            # dict.get() supplies 0 for an entry not seen yet
            counts_dict[entry] = counts_dict.get(entry, 0) + 1
    # Return counts_dict
    return counts_dict
# Call count_entries(): result_counts
result_counts = count_entries('tweets.csv',10,'lang')
# Print result_counts
print(result_counts)
List comprehensions
- Basic
[output expression for iterator variable in iterable]
- Advanced
[output expression + conditional on output for iterator variable in iterable + conditional on iterable]
Nested list comprehensions
# Create list comprehension: squares
squares = [i**2 for i in range(10)]
print(squares)
> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# Nested list comprehensions
[[output expression] for iterator variable in iterable]
# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]
# Print the matrix
for row in matrix:
print(row)
> [0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
Using conditionals in comprehensions
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
# Create list comprehension with if:
new_fellowship = [member for member in fellowship if len(member) >= 7]
# Print the new list
print(new_fellowship)
> ['samwise', 'aragorn', 'legolas', 'boromir']
# Create list comprehension with if-else:
new_fellowship = [member if len(member)>=7 else '' for member in fellowship]
# Print the new list
print(new_fellowship)
> ['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']
Dict comprehensions
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
# Create dict comprehension: new_fellowship
new_fellowship = {member:len(member) for member in fellowship}
# Print the new dictionary
print(new_fellowship)
> {'frodo': 5, 'samwise': 7, 'merry': 5, 'aragorn': 7, 'legolas': 7, 'boromir': 7, 'gimli': 5}
generator expressions
Efficient Pandas: Using Chunksize for Large Data Sets
# Create generator object: result
result = (num for num in range(31))
# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))
# Print the rest of the values
for value in result:
print(value)
- Changing the output in generator expressions
# Create a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']
# Create a generator object: lengths
lengths = (len(person) for person in lannister)
# Iterate over and print the values in lengths
for value in lengths:
print(value)
- Build a generator
# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']
# Define generator function get_lengths
def get_lengths(input_list):
"""Generator function that yields the
length of the strings in input_list."""
# Yield the length of a string
for person in input_list:
yield len(person)
# Print the values generated by get_lengths()
for value in get_lengths(lannister):
print(value)
- Wrapping up comprehensions and generators.
# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']
# Extract the clock time (characters 12 to 19 of each string, i.e. entry[11:19]) for entries whose seconds field entry[17:19] equals '19': tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']
# Print the extracted times
print(tweet_clock_time)
Case study
Dataset: World Bank World Development Indicators
Warm up
- Dictionaries for data science
# Zip lists: zipped_lists
zipped_lists = zip(feature_names,row_vals)
# Create a dictionary: rs_dict
rs_dict = dict(zipped_lists)
# Print the dictionary
print(rs_dict)
> {'CountryName': 'Arab World', 'CountryCode': 'ARB', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'Year': '1960', 'Value': '133.56090740552298'}
- Writing a function to help
# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values.

    Pairs are formed with zip(), so extra items in the longer list are
    ignored.
    """
    # Build the dictionary once, directly from the zipped pairs
    return dict(zip(list1, list2))
# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)
# Print rs_fxn
print(rs_fxn)
> {'CountryName': 'Arab World', 'CountryCode': 'ARB', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'Year': '1960', 'Value': '133.56090740552298'}
- Using a list comprehension
# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names,sublist) for sublist in row_lists]
# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])
- Turning this all into a DataFrame
# Import the pandas package
import pandas as pd
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)
# Print the head of the DataFrame
print(df.head())
> print(df.head())
CountryCode CountryName ... Value Year
0 ARB Arab World ... 133.56090740552298 1960
1 ARB Arab World ... 87.7976011532547 1960
2 ARB Arab World ... 6.634579191565161 1960
3 ARB Arab World ... 81.02332950839141 1960
4 ARB Arab World ... 3000000.0 1960
[5 rows x 6 columns]
Python generators for streaming data
- Processing data in chunks
# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Skip the column names
    file.readline()
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}
    # Process only the first 1000 rows
    for _ in range(1000):
        raw_line = file.readline()
        # readline() returns '' at end of file; stop there so a file with
        # fewer than 1000 rows does not add an empty-string key
        if not raw_line:
            break
        # Split the current line into a list: line
        line = raw_line.split(',')
        # Get the value for the first column: first_col
        first_col = line[0]
        # Increment the count, starting from 0 for a value not seen yet
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
# Print the resulting dictionary
print(counts_dict)
> {'Arab World': 80, 'Caribbean small states': 77, 'Central Europe and the Baltics': 71, 'East Asia & Pacific (all income levels)': 122, 'East Asia & Pacific (developing only)': 123, 'Euro area': 119, 'Europe & Central Asia (all income levels)': 109, 'Europe & Central Asia (developing only)': 89, 'European Union': 116, 'Fragile and conflict affected situations': 76, 'Heavily indebted poor countries (HIPC)': 18}
- Writing a generator to load data in chunks
# Define read_large_file()
def read_large_file(file_object):
    """Lazily yield the lines of file_object one at a time."""
    current = file_object.readline()
    # readline() gives back an empty (falsy) value once the file is exhausted
    while current:
        yield current
        current = file_object.readline()
# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)
    # Print the first two lines of the file; each next() call pulls one
    # more line from the generator
    print(next(gen_file))
    print(next(gen_file))
> CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
> Arab World,ARB,"Adolescent fertility rate (births per 1,000 women ages 15-19)",SP.ADO.TFRT,1960,133.56090740552298
# Initialize an empty dictionary: counts_dict
counts_dict = {}
# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Create the generator of lines from read_large_file()
    lines = read_large_file(file)
    # Skip the column names so the header is not counted as a data row
    next(lines, None)
    # Iterate over the remaining lines
    for line in lines:
        row = line.split(',')
        first_col = row[0]
        # Increment the count, starting from 0 for a value not seen yet
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
# Print
print(counts_dict)
Writing an iterator to load data in chunks
# Define plot_pop()
def plot_pop(filename, country_code):
    """Plot total urban population against year for one country code.

    Reads filename in chunks of 1000 rows, keeps the rows whose
    'CountryCode' equals country_code, adds a 'Total Urban Population'
    column to them and draws a scatter plot of that column versus 'Year'.
    """
    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Collect the filtered chunks and concatenate them once at the end;
    # DataFrame.append() was removed in pandas 2.0
    frames = []

    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Check out specific country: df_pop_ceb
        # .copy() gives an independent frame, so adding a column below
        # does not write into a slice of the chunk
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Zip DataFrame columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                   df_pop_ceb['Urban population (% of total)'])

        # Use list comprehension to create new DataFrame column 'Total Urban Population'
        # (the percentage is scaled by 0.01 to turn it into a fraction)
        df_pop_ceb['Total Urban Population'] = [
            int(total * pct * 0.01) for total, pct in pops
        ]

        frames.append(df_pop_ceb)

    # Combine all chunks into one DataFrame: data
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    # NOTE(review): plt is not imported in the visible notes; this presumably
    # relies on `import matplotlib.pyplot as plt` elsewhere. TODO confirm
    plt.show()
# Set the filename: fn
fn = 'ind_pop_data.csv'
# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')
# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')