
1.SELECT columns: SELECT
# Single column
SELECT title
FROM films
df['A']
# multiple columns
SELECT title, release_year
FROM films
df[['A','B']]
# All columns
SELECT *
FROM people
LIMIT 10;
df.head(10)
#unique values from a column
SELECT DISTINCT country
FROM films
df['country'].unique()
#Count the number of rows in column(Notice uncount the NA values:)
SELECT COUNT(*)
FROM people;
df['A'].count()
#Count the number of unique rows in column(Notice uncount the NA values:)
SELECT COUNT(DISTINCT language)
FROM films;
df['A'].nunique()
5
2.Filter rows: WHERE
The following operators can be used in the WHERE clause:
2.1Filter Numeric fields (=,<>,<,>,<=,>=)
SELECT *
FROM films
WHERE budget > 10000;
df[df['budget']>10000]
2.2Filter TEXT fields (Include qoutes for text)
SELECT *
FROM films
WHERE language='French'
df[df['language']=French]
2.3 AND
SELECT title,release_year
FROM films
WHERE release_year < 2000 AND language='Spanish';
df[df['release_year'] < 2000 & df['language']='Spanish']
2.4 OR
SELECT title
FROM films
WHERE (release_year = 1994 OR release_year = 1995)
AND (certification = 'PG' OR certification = 'R');
df[(df['release_year'] = 1994) & (df['certification']='PG')]
2.5 BETWEEN (filtering values within a specified range)
# BETWEEN is inclusive, meaning the beginning and end values are included in the results!
SELECT title,release_year
FROM films
WHERE release_year BETWEEN 1990 AND 2000
2.6 IN (To specify multiple possible values for a column)
SELECT title,release_year
FROM films
WHERE release_year IN (1990,2000)
AND duration>120;
df[df['A'].isin([1, 3, 12, 'a'])]
2.7 IS NULL (check missing value)
WHERE language IS NULL
df.isnull().sum()
2.8 LIKE (Search for a pattern)
Start with 'B': WHERE name LIKE 'B%'
Ends with 'A' : WHERE name LIKE '%A'
Second letter is r: WHERE name LIKE '_r%'
Have "or" in any position: WHERE name LIKE '%r%'
Not start with A: WHERE name NOT LIKE 'A%'
df[df['A'].str.startswith('b', na=False)]
df[df['A'].str.endswith('b', na=False)]
df[df['A'].str.contains("hello")]
df[df['A'].str.contains("Hello|Britain")==True]
3.Aggregate Function
Aggregate Function returns a value
SELECT COUNT(column_name)
SELECT SUM(column_name)
SELECT AVG(column_name)
SELECT MIN(column_name)
SELECT MAX(column_name)
Aliasing
SELECT title, (gross-budget) AS net_profit
FROM films
SELECT COUNT(deathdate)*100.0/COUNT(*) as percentage_dead
FROM people
ORDER BY
Get all details for all films except those released in 2015 and order them by duration.
#Sorting single column
SELECT *
FROM films
WHERE release_year != 2015
ORDER BY duration
#Sorting multiple columns
SELECT birthdate,name
FROM people
ORDER BY birthdate,name
4.GROUP BY
Commonly, GROUP BY is used with aggregate functions like COUNT() or MAX()
#Group by single column
SELECT sex, count(*)
FROM employees
GROUP BY sex;
df.groupby('sex').count()
df['A'].value_counts()
#Group by multiple column
SELECT release_year,AVG(duration)
FROM films
GROUP BY release_year
df.groupby('release_year')['duration'].mean()
#Get the release year, country, and highest budget spent making a film for each year, for each country. Sort your results by release year and country.
SELECT release_year,MAX(budget),country
FROM films
GROUP BY release_year,country
5. Case when and Then
Define a new category
#Transfer the numerical column to category column, then name the new column as geosize_group.
#INTO Clause is used to create a new table
SELECT name, continent, code, surface_area,
CASE
WHEN surface_area >2000000 THEN 'large'
WHEN surface_area >350000 AND surface_area <200000000 THEN 'medium'
ELSE 'small'
END
AS geosize_group
INTO pop_plus
FROM countries
dataset.loc[dataset['Fare']<=7.91,'Fare'] = 0
dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),'Fare'] = 1
dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
6.HAVING
If you want to filter on aggregate functions like COUNT(),you need to use HAVING clause, WHERE is invalid.
SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;
df[df.groupby('release_year')['title'].count()>10]
7.JOINS
If you want to query multiple tables, you need joining tables first
7.1 INNER JOIN
Based on matching values selects rows from both tables.
SELECT c.code AS country_code, year, inflation_rate, c.name
FROM countries AS c
Inner JOIN economies AS e
ON c.code = e.code;
#when the matching column name is same in both table can use USING (coded) as a shortcut
SELECT c.code, name, region, e.year, fertility_rate, unemployment_rate
FROM countries AS c
INNER JOIN populations AS p
ON c.code = p.country_code
INNER JOIN economies AS e
ON c.code = e.code
#Inner join with using
SELECT *
FROM countries
INNER JOIN economies
USING(code)
7.2 lEFT JOIN
Return all the records from the left table and the matched records from the right table
# one to many relationship: one customer match many order ID
SELECT Customers.CustomerName, Orders.OrderID
FROM Customers
LEFT JOIN Orders
ON Customers.CustomerID=Orders.CustomerID
ORDER BY Customers.CustomerName;
7.3 RIGHT JOIN
Return all the records from the right table and the matched records from the left table
# many to one relationship: many OrderID match one employees
SELECT Orders.OrderID, Employees.LastName, Employees.FirstName
FROM Orders
RIGHT JOIN Employees
ON Orders.EmployeeID = Employees.EmployeeID
ORDER BY Orders.OrderID;
7.4 FULL JOIN
SELECT Customers.CustomerName, Orders.OrderID
FROM Customers
FULL OUTER JOIN Orders ON Customers.CustomerID=Orders.CustomerID
ORDER BY Customers.CustomerName;
7.5 CROSS JOIN
SELECT c.name AS city, l.name AS language
FROM cities AS c
CROSS JOIN languages AS l
WHERE c.name LIKE 'Hyder%';
7.6 SELF JOIN
SELECT p1. country_code,
p1.size AS size2010,
p2.size AS size2015, (p2.size-p1.size)/p1.size*100.0 AS growth_perc
FROM populations AS p1
INNER JOIN populations AS P2
ON P1. country_code = P2.country_code
AND p1.year=p2.year-5;
Union
SELECT *
FROM economies2010
UNION
SELECT *
FROM economies2015
8.Subquery
8.1 Subquery inside WHERE
If the subquery is one or multiple value or word
# Calculate the avg life_expectancy
SELECT AVG(life_expectancy)
FROM populations
WHERE year = 2015;
---
# Then get those who live longer 1.15 times than the avg life_expectancy
SELECT *
FROM populations
WHERE life_expectancy > 1.15 * (SELECT AVG(life_expectancy) FROM populations WHERE year = 2015)
AND year = 2015;
8.2 Subquery inside SELECT
SELECT countries.name AS country,
(SELECT COUNT(*)
FROM cities
WHERE countries.code = cities.country_code) AS cities_num
FROM countries
ORDER BY cities_num DESC, country
LIMIT 9;
8.3 Subquery inside FROM
If subquery is one or multiple column then can be regarded as a new table
SELECT name,subquery.count
FROM countries,(SELECT country_code, COUNT(*)
FROM cities
GROUP BY country_code
) AS subquery
WHERE countries.code = subquery.country_code;




近期评论