2020-06-14-logisticRegressionBasic/python-opencv-imageProcess

Record

Posted by Jocelyn on June 14, 2020

2020-06-14-recording.

Logistic Regression

# -*- coding: UTF-8 -*-
# Logistic regression
from argparse import Namespace
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib
import urllib.request  # in Python 3, urllib.request is a standalone module and needs its own import
# Arguments
args=Namespace(
    seed=1234,
    data_file="titanic.csv",
    train_size=0.75,
    test_size=0.25,
    num_epochs=100,
)
# Set the random seed for reproducible results
np.random.seed(args.seed)
# Download the data from GitHub to the local notebook
url="https://raw.githubusercontent.com/GokuMohandas/practicalAI/master/data/titanic.csv"  # note: this URL no longer works
response = urllib.request.urlopen(url)
html = response.read()
with open(args.data_file, 'wb') as f:
    f.write(html)
# Read the CSV file into a DataFrame
df = pd.read_csv(args.data_file,header=0)
df.head()

# scikit-learn implementation

# scikit-learn's LogisticRegression class fits with coordinate descent; here we use scikit-learn's SGDClassifier class to train with stochastic gradient descent instead

# Imports
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Preprocessing
def preprocess(df):
    # Drop rows that contain NaN values
    df = df.dropna()
    # Drop the text-based features
    features_to_drop = ["name", "cabin", "ticket"]
    df = df.drop(features_to_drop, axis=1)

    # pclass, sex, and embarked are categorical variables
    categorical_features = ["pclass", "embarked", "sex"]
    df = pd.get_dummies(df, columns=categorical_features)
    return df
# Apply the preprocessing
df=preprocess(df)
df.head()
# Split the data into train and test sets
mask = np.random.rand(len(df)) < args.train_size
train_df = df[mask]
test_df = df[~mask]
print("Train size: {0}, test size: {1}".format(len(train_df), len(test_df)))
# Separate X and y
X_train =train_df.drop(["survived"],axis=1)
y_train =train_df["survived"]
X_test = test_df.drop(["survived"],axis=1)
y_test = test_df["survived"]

# Fit the scaler on the training data (mean=0, std=1)
X_scaler = StandardScaler().fit(X_train)
# Standardize both the training and test data (don't standardize the class labels y)
standardized_X_train = X_scaler.transform(X_train)
standardized_X_test = X_scaler.transform(X_test)
# Check
print("mean:",np.mean(standardized_X_train,axis=0))
print("std:",np.std(standardized_X_train,axis=0))
# Initialize the model
log_reg = SGDClassifier(loss="log", penalty="none", max_iter=args.num_epochs, random_state=args.seed)
# note: newer scikit-learn versions spell these options loss="log_loss" and penalty=None
# Train
log_reg.fit(X=standardized_X_train,y=y_train)
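# As noted above, scikit-learn's LogisticRegression class is the alternative to SGDClassifier;
# a minimal sketch of the equivalent fit (the solver choice here is an assumption):
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver="liblinear", random_state=args.seed)
lr.fit(standardized_X_train, y_train)
print("LogisticRegression train acc: {0:.2f}".format(lr.score(standardized_X_train, y_train)))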
# Probabilities
pred_test=log_reg.predict_proba(standardized_X_test)
print(pred_test[:5])
# Class predictions (on the standardized inputs)
pred_train=log_reg.predict(standardized_X_train)
pred_test = log_reg.predict(standardized_X_test)
print(pred_test)

# Evaluation
from sklearn.metrics import accuracy_score
# Accuracy
train_acc = accuracy_score(y_train, pred_train)
test_acc = accuracy_score(y_test, pred_test)
print("train acc: {0:.2f}, test acc: {1:.2f}".format(train_acc, test_acc))
# So far we've judged the model with accuracy alone, but there are many other metrics for evaluating a model
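# For instance, precision, recall, and F1 give a fuller picture than accuracy alone;
# a quick sketch using scikit-learn's built-in report:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_test))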

# Inference
# See whether you yourself would have survived on the Titanic
X_infer=pd.DataFrame([{"name": "Goku Mohandas", "cabin": "E", "ticket": "E44",
                         "pclass": 1, "age": 24, "sibsp": 1, "parch": 2,
                         "fare": 100, "embarked": "C", "sex": "male"}])
X_infer.head()
# Preprocess
X_infer = preprocess(X_infer)
X_infer.head()
# Add the columns that are missing (dummy variables not produced by this single row)
missing_features = set(X_test.columns)-set(X_infer.columns)
for feature in missing_features:
    X_infer[feature] = 0
# Reorder the columns to match the training data
X_infer = X_infer[X_train.columns]
X_infer.head()
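# Equivalently, pandas can add the missing columns and reorder them in one call;
# a sketch with DataFrame.reindex (produces the same frame as the two steps above):
X_infer = X_infer.reindex(columns=X_train.columns, fill_value=0)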
# Standardize
standardized_X_infer = X_scaler.transform(X_infer)
# Predict class probabilities (predict_proba, not predict, so we can index into the per-class probabilities below)
y_infer = log_reg.predict_proba(standardized_X_infer)
classes = {0: "died", 1: "survived"}
_class = np.argmax(y_infer)
print("Looks like I would've {0} with about {1:.0f}% probability on the Titanic expedition!".format(
    classes[_class], y_infer[0][_class]*100.0
))
# Unstandardized coefficients (map the weights back to the original feature scale)
coef = log_reg.coef_ / X_scaler.scale_
intercept = log_reg.intercept_ - np.sum((coef * X_scaler.mean_))
print(coef)
print(intercept)
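# Why this works: the model was fit on z = (x - mean) / scale, so
#   w.z + b = (w/scale).x + (b - sum(w*mean/scale));
# hence coef = w/scale and intercept = b - sum(coef*mean) are the
# weights of the same model on the original, unstandardized features.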

indices = np.argsort(coef)
features = list(X_train.columns)
print("Features correlated with death:", [features[i] for i in indices[0][:3]])
print("Features correlated with survival:", [features[i] for i in indices[0][-3:]])

# K-fold cross validation
# Cross validation is a resampling-based model evaluation method. Instead of splitting the data
# into a training and validation set just once, cross validation builds K (usually K=5 or 10)
# different train/validation splits (see the KFold sketch after the cross_val_score example below):
# 1. Randomly shuffle the training dataset.
# 2. Split the dataset into K folds.
# 3. In each of the K iterations, pick one fold as the validation set and use all remaining folds as the training set.
# 4. Repeat so that every fold gets a turn as part of the training or validation set.
# 5. Train the model with randomly initialized weights.
# 6. Re-initialize the model in each of the K iterations, keeping the same random weight initialization, then evaluate on the validation fold.

from sklearn.model_selection import cross_val_score
# Run 10-fold cross validation
log_reg = SGDClassifier(loss="log", penalty="none", max_iter=args.num_epochs)
scores = cross_val_score(log_reg, standardized_X_train, y_train, cv=10, scoring="accuracy")
print("Scores:",scores)
print("Mean:",scores.mean())
print("Standard Deviation:",scores.std())

python-opencv-imageProcess-basic

# -*- coding: UTF-8 -*-
import cv2  # the "2" in cv2 is not the OpenCV version number: cv and cv2 refer to the underlying C API and C++ API respectively
vc = cv2.VideoCapture('test.mp4')
if vc.isOpened():
    opened, frame = vc.read()
else:
    opened = False

while opened:
    ret, frame = vc.read()
    if frame is None:
        break
    if ret:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        cv2.imshow('result', gray)
        if cv2.waitKey(1) & 0xFF == 27:  # waitKey(1) waits 1 ms between frames (controls playback speed); 27 is the Esc key
            break
vc.release()
cv2.destroyAllWindows()

img = cv2.imread('cat.jpg')
b, g, r = cv2.split(img)
b.shape  # a split channel is 2-D, so this is (h, w); the full img.shape is (h, w, c) = height, width, channels

# Display helper (not defined in the original snippet; assumed to be the usual pattern)
def cv_show(name, img):
    cv2.imshow(name, img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# To keep only the R channel, zero out the B and G channels
cur_img = img.copy()
cur_img[:, :, 0] = 0  # B
cur_img[:, :, 1] = 0  # G
cv_show('R', cur_img)
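# cv2.merge is the inverse of cv2.split; a quick sketch recombining the channels
# and building the B-only counterpart of the R-only image above:
img2 = cv2.merge((b, g, r))  # identical to the original img
cur_img = img.copy()
cur_img[:, :, 1] = 0  # G
cur_img[:, :, 2] = 0  # R
cv_show('B', cur_img)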

# Border padding
top_size, bottom_size, left_size, right_size = (50, 50, 50, 50)
# The function for this is copyMakeBorder
result = cv2.copyMakeBorder(img, top_size, bottom_size, left_size, right_size, borderType=cv2.BORDER_REPLICATE)
# This is only one example; borderType can also be BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP, or BORDER_CONSTANT (a comparison sketch follows below)
# BORDER_REPLICATE: repeats the outermost edge pixels over and over
# BORDER_REFLECT: mirrors the image at the border, fedcba|abcdefgh|hgfedcba
# BORDER_REFLECT_101: mirrors with the edge pixel as the axis, so the edge value isn't duplicated: gfedcb|abcdefgh|gfedcba
# BORDER_WRAP: wraps the image around, cdefgh|abcdefgh|abcdefg
# BORDER_CONSTANT: pads with a constant value (passed via the value argument)
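# A sketch comparing two of the border types side by side (matplotlib is an extra
# dependency in this script; images are flipped BGR->RGB for display):
import matplotlib.pyplot as plt
constant = cv2.copyMakeBorder(img, top_size, bottom_size, left_size, right_size,
                              borderType=cv2.BORDER_CONSTANT, value=0)
plt.subplot(121), plt.imshow(result[:, :, ::-1]), plt.title('REPLICATE')
plt.subplot(122), plt.imshow(constant[:, :, ::-1]), plt.title('CONSTANT')
plt.show()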

# Numeric operations
img_cat = cv2.imread('cat.jpg')
img_dog = cv2.imread('dog.jpg')
img_cat2 = img_cat + 10  # adds 10 to every element
(img_cat + img_cat2)  # surprising result: dtype=uint8 only holds 0-255, so numpy addition wraps around modulo 256
cv2.add(img_cat, img_cat2)  # different result: cv2.add saturates, clipping any sum above 255 to 255
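# A quick sketch of the two behaviours on bare uint8 values:
import numpy as np
x = np.uint8([250])
y = np.uint8([10])
print(x + y)          # [4]: numpy wraps around modulo 256
print(cv2.add(x, y))  # [[255]]: OpenCV saturates at 255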
# Image blending
# img_cat.shape  # (414, 500, 3)
# img_dog.shape  # (429, 499, 3)
# The shapes differ, so img_cat + img_dog would raise an error
img_dog = cv2.resize(img_dog, (500, 414))  # note that resize takes (w, h), the reverse of shape's (h, w, c)
img_dog.shape  # now (414, 500, 3)

res = cv2.resize(img, (0, 0), fx=3, fy=1)  # fx and fy scale width and height respectively, so this triples the width


res = cv2.addWeighted(img_cat, 0.4, img_dog, 0.6, 0)  # weighted blend: res = 0.4*img_cat + 0.6*img_dog + gamma, where the last argument is the scalar offset gamma
