Final Project: Estimating Air Pollutant Levels

Tong Chen, Zijing Gu, Yiming Wen, Jiahao Wu

In [1]:
import pandas as pd
import datetime

Load Data to DataFrame

In [2]:
data_dir = "./PRSA_Data_20130301-20170228/"
# We select the 'Beijing Multi-Site Air-Quality Data Data Set' from the machine learning repository.
# The URL is: https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data

# read the data from different location using pandas
aotizhongxin = pd.read_csv(data_dir + "PRSA_Data_Aotizhongxin_20130301-20170228.csv")
changping = pd.read_csv(data_dir + "PRSA_Data_Changping_20130301-20170228.csv")
dingling = pd.read_csv(data_dir + "PRSA_Data_Dingling_20130301-20170228.csv")
dongsi = pd.read_csv(data_dir + "PRSA_Data_Dongsi_20130301-20170228.csv")
guanyuan = pd.read_csv(data_dir + "PRSA_Data_Guanyuan_20130301-20170228.csv")
gucheng = pd.read_csv(data_dir + "PRSA_Data_Gucheng_20130301-20170228.csv")
huairou = pd.read_csv(data_dir + "PRSA_Data_Huairou_20130301-20170228.csv")
nongzhanguan = pd.read_csv(data_dir + "PRSA_Data_Nongzhanguan_20130301-20170228.csv")
shunyi = pd.read_csv(data_dir + "PRSA_Data_Shunyi_20130301-20170228.csv")
tiantan = pd.read_csv(data_dir + "PRSA_Data_Tiantan_20130301-20170228.csv")
wanliu = pd.read_csv(data_dir + "PRSA_Data_Wanliu_20130301-20170228.csv")
wanshouxigong = pd.read_csv(data_dir + "PRSA_Data_Wanshouxigong_20130301-20170228.csv")

Features

No: row number

year: year of data in this row

month: month of data in this row

day: day of data in this row

hour: hour of data in this row

PM2.5: PM2.5 concentration (ug/m$^3$)

PM10: PM10 concentration (ug/m$^3$)

SO2: SO$_2$ concentration (ug/m$^3$)

NO2: NO$_2$ concentration (ug/m$^3$)

CO: CO concentration (ug/m$^3$)

O3: O$_3$ concentration (ug/m$^3$)

TEMP: temperature (degree Celsius)

PRES: pressure (hPa)

DEWP: dew point temperature (degree Celsius)

RAIN: precipitation (mm)

wd: wind direction

WSPM: wind speed (m/s)

station: name of the air-quality monitoring site

Examine missing values

In [3]:
# Examine the missing values in a single data frame
aotizhongxin.isna().sum()
Out[3]:
No            0
year          0
month         0
day           0
hour          0
PM2.5       925
PM10        718
SO2         935
NO2        1023
CO         1776
O3         1719
TEMP         20
PRES         20
DEWP         20
RAIN         20
wd           81
WSPM         14
station       0
dtype: int64

Fill in missing values detected above

In [4]:
def fix_missing_value(input_df):
    # Fixing the missing numerical values by linear interpolation.
    input_df = input_df.interpolate(method='linear')
    # Map wind to numerical values
    mapping = {'N':0,'NNE':1*22.5,'NE':2*22.5,'ENE':3*22.5,'E':4*22.5,'ESE':5*22.5,'SE':6*22.5,
               'SSE':7*22.5,'S':8*22.5,'SSW':9*22.5,'SW':10*22.5,'WSW':11*22.5,'W':12*22.5,'WNW':13*22.5,
               'NW':14*22.5,'NNW':15*22.5}
    # Fill the missing wind with cloest data.
    input_df = input_df.applymap(lambda s: mapping.get(s) if s in mapping else s)
    input_df = input_df.fillna(method='ffill')
    # In order to group the data frame, we need to combine year month day hour so each row has a unique index.
    input_df['Detail_Date'] = input_df[['year','month','day','hour']].apply(lambda x : '{}+{}+{}+{}'.format(x[0],x[1],x[2],x[3]), axis=1)
    return input_df
In [5]:
# Fix missing values for all the data.
aotizhongxin = fix_missing_value(aotizhongxin)
changping = fix_missing_value(changping)
dingling = fix_missing_value(dingling)
dongsi = fix_missing_value(dongsi)
guanyuan = fix_missing_value(guanyuan)
gucheng = fix_missing_value(gucheng)
huairou = fix_missing_value(huairou)
nongzhanguan = fix_missing_value(nongzhanguan)
shunyi = fix_missing_value(shunyi)
tiantan = fix_missing_value(tiantan)
wanliu = fix_missing_value(wanliu)
wanshouxigong = fix_missing_value(wanshouxigong)

Function that groups DataFrame by feature and averages by date

In [6]:
# Combine data frame on detail date, take the average.
# Input: array of df.
def group_df(df_array):
    final_df = df_array[0]
    for i in range(1,len(df_array)):
        final_df = pd.concat([final_df, df_array[i]]).groupby('Detail_Date').mean()
    return final_df
In [7]:
# List of urban area of Beijing
urban = [aotizhongxin, dongsi, guanyuan, gucheng, nongzhanguan, tiantan, wanliu, wanshouxigong]
# List of suburb area of Beijing
suburb = [huairou, dingling, changping, shunyi]

Group by urban areas and suburb areas

In [8]:
# take the average of data for urban and suburb areas.
urban_df = group_df(urban)
suburb_df = group_df(suburb)
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  
In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
import numpy as np
import matplotlib.pyplot as plt

Fit the data using Polynomial Regression

In [10]:
# Parameters: Month, hour, PRES, temp, WSPM, rain
# Prediction: PM2.5, PM10, SO2
total_df = group_df([urban_df, suburb_df])
parameters = total_df[['month','hour','TEMP','WSPM','RAIN','DEWP']]
X = np.array(parameters.values.tolist())
total_df.describe()
# {'CO':[100,9700],'NO2':[2,228],'O3':[0.321300,334.000000],'PM10':[3.000000,951.500000],'PM2.5':[3.000000,970.000000],'SO2':[0.571200,325.000000]}
# {'month':(1,12),'hour':(0,24),'PRES':(986.550000,1042.400000),'TEMP':(-16.800000,40.550000),'WSPM':(1.776617,13.000000),'RAIN':(0.000000,29.150000),'DEWP':(-35.300000,27.800000)}
Out[10]:
CO DEWP NO2 No O3 PM10 PM2.5 PRES RAIN SO2 TEMP WSPM day hour month wd year
count 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000 35064.000000
mean 1285.761614 2.578186 49.795049 17532.500000 55.069395 105.888865 82.252755 1012.294071 0.062648 15.404662 13.578669 1.776617 15.729637 11.500000 6.522930 155.969263 2014.662560
std 1128.262268 13.688803 31.006301 10122.249256 54.226856 90.291571 80.953033 10.287780 0.645239 20.773979 11.416247 1.142264 8.800218 6.922285 3.448752 85.053759 1.177213
min 100.000000 -35.300000 2.000000 1.000000 0.321300 3.000000 3.000000 986.550000 0.000000 0.571200 -16.800000 0.000000 1.000000 0.000000 1.000000 0.000000 2013.000000
25% 550.000000 -8.650000 25.500000 8766.750000 11.500000 38.000000 22.500000 1003.750000 0.000000 3.000000 3.100000 1.000000 8.000000 5.750000 4.000000 90.000000 2014.000000
50% 950.000000 3.150000 44.000000 17532.500000 41.500000 86.000000 59.500000 1012.000000 0.000000 6.500000 14.550000 1.450000 16.000000 11.500000 7.000000 157.500000 2015.000000
75% 1600.000000 15.250000 68.000000 26298.250000 77.000000 145.000000 113.500000 1020.500000 0.000000 19.000000 23.350000 2.200000 23.000000 17.250000 10.000000 202.500000 2016.000000
max 9700.000000 27.800000 228.500000 35064.000000 334.000000 951.500000 970.000000 1042.400000 29.150000 325.000000 40.550000 13.000000 31.000000 23.000000 12.000000 337.500000 2017.000000
In [11]:
# Grab traning and test data
out_put_pm25 = total_df[['PM2.5']]
out_put_pm10 = total_df[['PM10']]
out_put_so2 = total_df[['SO2']]
out_put_co = total_df[['CO']]
out_put_no2 = total_df[['NO2']]
out_put_o3 = total_df[['O3']]
Y_pm25 = np.array(out_put_pm25.values.tolist())
Y_pm10 = np.array(out_put_pm10.values.tolist())
Y_so2 = np.array(out_put_so2.values.tolist())
Y_co = np.array(out_put_co.values.tolist())
Y_no2 = np.array(out_put_no2.values.tolist())
Y_o3 = np.array(out_put_o3.values.tolist())
In [12]:
# Generate coefficient map.
poly = PolynomialFeatures(degree=3)
parameters_poly = poly.fit_transform(parameters)
names = poly.fit(parameters)
print(names.get_feature_names())
['1', 'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5', 'x1^2', 'x1 x2', 'x1 x3', 'x1 x4', 'x1 x5', 'x2^2', 'x2 x3', 'x2 x4', 'x2 x5', 'x3^2', 'x3 x4', 'x3 x5', 'x4^2', 'x4 x5', 'x5^2', 'x0^3', 'x0^2 x1', 'x0^2 x2', 'x0^2 x3', 'x0^2 x4', 'x0^2 x5', 'x0 x1^2', 'x0 x1 x2', 'x0 x1 x3', 'x0 x1 x4', 'x0 x1 x5', 'x0 x2^2', 'x0 x2 x3', 'x0 x2 x4', 'x0 x2 x5', 'x0 x3^2', 'x0 x3 x4', 'x0 x3 x5', 'x0 x4^2', 'x0 x4 x5', 'x0 x5^2', 'x1^3', 'x1^2 x2', 'x1^2 x3', 'x1^2 x4', 'x1^2 x5', 'x1 x2^2', 'x1 x2 x3', 'x1 x2 x4', 'x1 x2 x5', 'x1 x3^2', 'x1 x3 x4', 'x1 x3 x5', 'x1 x4^2', 'x1 x4 x5', 'x1 x5^2', 'x2^3', 'x2^2 x3', 'x2^2 x4', 'x2^2 x5', 'x2 x3^2', 'x2 x3 x4', 'x2 x3 x5', 'x2 x4^2', 'x2 x4 x5', 'x2 x5^2', 'x3^3', 'x3^2 x4', 'x3^2 x5', 'x3 x4^2', 'x3 x4 x5', 'x3 x5^2', 'x4^3', 'x4^2 x5', 'x4 x5^2', 'x5^3']
In [13]:
def train_save(parameters_poly, Y_pm25, coef_map, bias_map, name):
    X_train = parameters_poly[:-500]
    Y_pm25_train = Y_pm25[:-500]
    X_test = parameters_poly[-500:]
    Y_pm25_test = Y_pm25[-500:]
    clf = linear_model.LinearRegression(normalize=True)
    clf.fit(X_train, Y_pm25_train)
    coef_map['PM2.5'] = clf.coef_.tolist()
    bias_map['PM2.5'] = clf.intercept_[0]
    predict = clf.predict(X_test)
    plt.plot(Y_pm25_test,label='truth')
    plt.plot(predict,label='predict')
    plt.ylabel('measurement')
    plt.xlabel('test days')
    plt.title("train result for " + name)
    plt.legend()
    plt.show()
In [14]:
# Train and save the coefficient for use on website
coef_map = {}
bias_map = {}

train_save(parameters_poly, Y_pm25, coef_map, bias_map, "pm25")
train_save(parameters_poly, Y_pm10, coef_map, bias_map, "pm10")
train_save(parameters_poly, Y_so2, coef_map, bias_map, "so2")
train_save(parameters_poly, Y_co, coef_map, bias_map, "co")
train_save(parameters_poly, Y_no2, coef_map, bias_map, "no2")
train_save(parameters_poly, Y_o3, coef_map, bias_map, "o3")

Calculate t

In [15]:
#reference: https://www.jianshu.com/p/e12634c2cb54
def cal_linear(Il,Ih,cl,ch,cp):
    iaqi = (Ih-Il)*(cp - cl)/(ch-cl) + Il
    return iaqi

def cal_pm25_iaqi(pm_val):
    if 0 <= pm_val <= 35:
        iaqi = cal_linear(0,50,0,35,pm_val)
    elif 36 <= pm_val <=75:
        iaqi = cal_linear(50,100,35,75,pm_val) 
    elif 76 <= pm_val <=115:
        iaqi = cal_linear(100,150,75,115,pm_val) 
    elif 116 <= pm_val <=150:
        iaqi = cal_linear(150,200,115,150,pm_val) 
    elif 151 <= pm_val <=250:
        iaqi = cal_linear(200,300,150,250,pm_val) 
    elif 251 <= pm_val <=350:
        iaqi = cal_linear(300,400,250,350,pm_val) 
    else:
        iaqi = cal_linear(400,500,350,500,pm_val)
    return iaqi

def cal_so2_iaqi(o3_val):
    if 0<=o3_val<=50:
        iaqi = cal_linear(0,50,0,50,o3_val)
    elif 51 <= o3_val <=150:
        iaqi = cal_linear(50,100,50,150,o3_val) 
    elif 151 <= o3_val <=475:
        iaqi = cal_linear(100,150,150,475,o3_val) 
    elif 476 <= o3_val <= 800:
        iaqi = cal_linear(150,200,475,800,o3_val) 
    elif 801 <= o3_val <= 1600:
        iaqi = cal_linear(200,300,800,1600,o3_val) 
    elif 1601 <= o3_val <= 2100:
        iaqi = cal_linear(300,400,1600,2100,o3_val) 
    else:
        iaqi = cal_linear(400,500,2100,2620,o3_val)
    return iaqi

def cal_pm10_iaqi(pm_val):
    if 0 <= pm_val <= 50:
        iaqi = cal_linear(0,50,0,50,pm_val)
    elif 51 <= pm_val <= 150:
        iaqi = cal_linear(50,100,50,150,pm_val) 
    elif 151 <= pm_val <= 250:
        iaqi = cal_linear(100,150,150,250,pm_val) 
    elif 251 <= pm_val <=350:
        iaqi = cal_linear(150,200,250,350,pm_val) 
    elif 351 <= pm_val <=420:
        iaqi = cal_linear(200,300,350,420,pm_val) 
    elif 421 <= pm_val <=500:
        iaqi = cal_linear(300,400,420,500,pm_val)
    else:
        iaqi = cal_linear(400,500,500,600,pm_val)
    return iaqi

def cal_no2_iaqi(pm_val):
    if 0 <= pm_val <= 40:
        iaqi = cal_linear(0,50,0,40,pm_val)
    elif 41 <= pm_val <=80:
        iaqi = cal_linear(50,100,40,80,pm_val) 
    elif 81 <= pm_val <=180:
        iaqi = cal_linear(100,150,80,180,pm_val) 
    elif 181 <= pm_val <=280:
        iaqi = cal_linear(150,200,180,280,pm_val) 
    elif 281 <= pm_val <=565:
        iaqi = cal_linear(200,300,280,565,pm_val) 
    elif 566 <= pm_val <=750:
        iaqi = cal_linear(300,400,565,750,pm_val) 
    else:
        iaqi = cal_linear(400,500,750,940,pm_val)
    return iaqi

def cal_co_iaqi(pm_val):
    pm_val = pm_val/1000
    if 0 <= pm_val <= 5:
        iaqi = cal_linear(0,50,0,5,pm_val)
    elif 6 <= pm_val <=10:
        iaqi = cal_linear(50,100,5,10,pm_val) 
    elif 11 <= pm_val <=35:
        iaqi = cal_linear(100,150,10,35,pm_val) 
    elif 36 <= pm_val <=60:
        iaqi = cal_linear(150,200,35,60,pm_val) 
    elif 61 <= pm_val <=90:
        iaqi = cal_linear(200,300,60,90,pm_val) 
    elif 91 <= pm_val <=120:
        iaqi = cal_linear(300,400,90,120,pm_val) 
    else:
        iaqi = cal_linear(400,500,120,150,pm_val)
    return iaqi

def cal_o3_iaqi(pm_val):
    if 0 <= pm_val <= 160:
        iaqi = cal_linear(0,50,0,160,pm_val)
    elif 161 <= pm_val <=200:
        iaqi = cal_linear(50,100,160,200,pm_val) 
    elif 201 <= pm_val <=300:
        iaqi = cal_linear(100,150,200,300,pm_val) 
    elif 301 <= pm_val <=400:
        iaqi = cal_linear(150,200,300,400,pm_val) 
    elif 401 <= pm_val <=800:
        iaqi = cal_linear(200,300,400,800,pm_val) 
    elif 801 <= pm_val <=1000:
        iaqi = cal_linear(300,400,800,1000,pm_val) 
    else:
        iaqi = cal_linear(400,500,1000,1200,pm_val)
    return iaqi

def aqi_val(param_list): 
    pm25_iaqi = cal_pm25_iaqi(param_list[0])
    so2_iaqi = cal_so2_iaqi(param_list[1])
    pm10_iaqi = cal_pm10_iaqi(param_list[2])
    no2_iaqi = cal_no2_iaqi(param_list[3])
    co_iaqi = cal_co_iaqi(param_list[4])
    o3_iaqi = cal_o3_iaqi(param_list[5])
    return max(pm25_iaqi, so2_iaqi, pm10_iaqi, no2_iaqi, co_iaqi, o3_iaqi)
In [16]:
aotizhongxin['AQI'] = aotizhongxin.apply(lambda x: aqi_val([x['PM2.5'],x['SO2'],x['PM10'],x['NO2'],x['CO'],x['O3']]), axis=1)
In [17]:
aotizhongxin.to_csv('AQI.csv', columns = ['year','month','day','hour','AQI'], index=False)