月落丹枫

1.SELECT columns: SELECT

# Single column
SELECT title
FROM films

df['A']
# multiple columns
SELECT title, release_year
FROM films

df[['A','B']]

# All columns
SELECT *
FROM people
LIMIT 10;

df.head(10)
#unique values from a column
SELECT DISTINCT country
FROM films

df['country'].unique()

#Count the number of rows in column(Notice uncount the NA values:)
SELECT COUNT(*)
FROM people;

df['A'].count()

#Count the number of unique rows in column(Notice uncount the NA values:)
SELECT COUNT(DISTINCT language)
FROM films;

df['A'].nunique()
5

2.Filter rows: WHERE

The following operators can be used in the WHERE clause:

2.1Filter Numeric fields (=,<>,<,>,<=,>=)

SELECT *
FROM films
WHERE budget > 10000;

df[df['budget']>10000]

2.2Filter TEXT fields (Include qoutes for text)

SELECT *
FROM films
WHERE language='French'

df[df['language']=French]

2.3 AND

SELECT title,release_year
FROM films
WHERE release_year < 2000 AND language='Spanish';

df[df['release_year'] < 2000 & df['language']='Spanish']

2.4 OR

SELECT title
FROM films
WHERE (release_year = 1994 OR release_year = 1995)
AND (certification = 'PG' OR certification = 'R');

df[(df['release_year'] = 1994) & (df['certification']='PG')]

2.5 BETWEEN (filtering values within a specified range)

# BETWEEN is inclusive, meaning the beginning and end values are included in the results!
SELECT title,release_year
FROM films
WHERE release_year BETWEEN 1990 AND 2000

2.6 IN (To specify multiple possible values for a column)

SELECT title,release_year
FROM films
WHERE release_year IN (1990,2000)  
AND duration>120;

df[df['A'].isin([1, 3, 12, 'a'])]

2.7 IS NULL (check missing value)

WHERE language IS NULL

df.isnull().sum()

2.8 LIKE (Search for a pattern)

Start with 'B': WHERE name LIKE 'B%'
Ends with 'A' : WHERE name LIKE '%A'
Second letter is r: WHERE name LIKE '_r%'
Have "or" in any position: WHERE name LIKE '%r%'
Not start with A: WHERE name NOT LIKE 'A%'


df[df['A'].str.startswith('b', na=False)]
df[df['A'].str.endswith('b', na=False)]
df[df['A'].str.contains("hello")]
df[df['A'].str.contains("Hello|Britain")==True]

3.Aggregate Function

Aggregate Function returns a value

SELECT COUNT(column_name)
SELECT SUM(column_name)
SELECT AVG(column_name)
SELECT MIN(column_name)
SELECT MAX(column_name)

Aliasing

SELECT title, (gross-budget) AS net_profit
FROM films

SELECT COUNT(deathdate)*100.0/COUNT(*) as percentage_dead
FROM people

ORDER BY

Get all details for all films except those released in 2015 and order them by duration.

#Sorting single column
SELECT *
FROM films
WHERE release_year != 2015
ORDER BY duration

#Sorting multiple columns
SELECT birthdate,name
FROM people
ORDER BY birthdate,name

4.GROUP BY

Commonly, GROUP BY is used with aggregate functions like COUNT() or MAX()

#Group by single column
SELECT sex, count(*)
FROM employees
GROUP BY sex;

df.groupby('sex').count()
df['A'].value_counts()

#Group by multiple column
SELECT release_year,AVG(duration)
FROM films
GROUP BY release_year

df.groupby('release_year')['duration'].mean()

#Get the release year, country, and highest budget spent making a film for each year, for each country. Sort your results by release year and country.
SELECT release_year,MAX(budget),country
FROM films
GROUP BY release_year,country

5. Case when and Then

Define a new category

#Transfer the numerical column to category column, then name the new column as geosize_group.
#INTO Clause is used to create a new table

SELECT name, continent, code, surface_area,
CASE
    WHEN surface_area >2000000 THEN 'large'
    WHEN surface_area >350000 AND surface_area <200000000  THEN  'medium'
    ELSE  'small'
END
AS geosize_group
INTO pop_plus
FROM countries


dataset.loc[dataset['Fare']<=7.91,'Fare']                                   = 0
dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),'Fare']  = 1
dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
dataset.loc[ dataset['Fare'] > 31, 'Fare']        = 3

6.HAVING

If you want to filter on aggregate functions like COUNT(),you need to use HAVING clause, WHERE is invalid.

SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;


df[df.groupby('release_year')['title'].count()>10]

7.JOINS

If you want to query multiple tables, you need joining tables first

7.1 INNER JOIN

Based on matching values selects rows from both tables.

SELECT c.code AS country_code, year, inflation_rate, c.name
FROM countries AS c
Inner JOIN economies AS e
ON c.code = e.code;
#when the matching column name is same in both table can use USING (coded) as a shortcut

SELECT c.code, name, region, e.year, fertility_rate, unemployment_rate
FROM countries AS c
INNER JOIN populations AS p
ON c.code = p.country_code
INNER JOIN economies AS e
ON c.code = e.code

#Inner join with using

SELECT *
FROM countries
INNER JOIN economies
USING(code)

7.2 lEFT JOIN

Return all the records from the left table and the matched records from the right table

# one to many relationship: one customer match many order ID

SELECT Customers.CustomerName, Orders.OrderID
FROM Customers
LEFT JOIN Orders
ON Customers.CustomerID=Orders.CustomerID
ORDER BY Customers.CustomerName;

7.3 RIGHT JOIN

Return all the records from the right table and the matched records from the left table

# many to one relationship: many OrderID match one employees

SELECT Orders.OrderID, Employees.LastName, Employees.FirstName
FROM Orders
RIGHT JOIN Employees
ON Orders.EmployeeID = Employees.EmployeeID
ORDER BY Orders.OrderID;

7.4 FULL JOIN

SELECT Customers.CustomerName, Orders.OrderID
FROM Customers
FULL OUTER JOIN Orders ON Customers.CustomerID=Orders.CustomerID
ORDER BY Customers.CustomerName;

7.5 CROSS JOIN

SELECT c.name AS city, l.name AS language
FROM cities AS c
CROSS JOIN languages AS l
WHERE  c.name LIKE 'Hyder%';

7.6 SELF JOIN

SELECT p1. country_code,
       p1.size AS size2010,
       p2.size AS size2015,  (p2.size-p1.size)/p1.size*100.0 AS growth_perc
FROM populations AS p1
INNER JOIN populations AS P2
ON  P1. country_code = P2.country_code
AND p1.year=p2.year-5;

Union

SELECT *
FROM economies2010
UNION
SELECT  *  
FROM economies2015

8.Subquery

8.1 Subquery inside WHERE

If the subquery is one or multiple value or word

# Calculate the avg life_expectancy
SELECT AVG(life_expectancy)
FROM populations
WHERE year = 2015;

---
# Then get those who live longer 1.15 times than the avg life_expectancy
SELECT *
FROM populations
WHERE life_expectancy > 1.15 * (SELECT AVG(life_expectancy) FROM populations WHERE year = 2015)
AND year = 2015;

8.2 Subquery inside SELECT

SELECT countries.name AS country,
  (SELECT COUNT(*)
   FROM cities
   WHERE countries.code = cities.country_code) AS cities_num
FROM countries
ORDER BY cities_num DESC, country
LIMIT 9;

8.3 Subquery inside FROM

If subquery is one or multiple column then can be regarded as a new table

SELECT name,subquery.count
FROM countries,(SELECT country_code, COUNT(*)
FROM cities
GROUP BY country_code
) AS subquery
WHERE countries.code = subquery.country_code;