$$\mathbf{X} = \begin{bmatrix} -\,{\mathbf{x}^{(1)}}^T\,- \\ -\,{\mathbf{x}^{(2)}}^T\,- \\ \vdots \\ -\,{\mathbf{x}^{(i)}}^T\,- \\ \vdots \\ -\,{\mathbf{x}^{(m)}}^T\,- \end{bmatrix}$$
$m \times n$
输入
$$\mathbf{x}^{(i)} = \begin{bmatrix} x_{1}^{(i)} & x_{2}^{(i)} & \cdots & x_{j}^{(i)} & \cdots & x_{n}^{(i)} \end{bmatrix}^T$$
$n \times 1$
标签
$$\mathbf{y} = \begin{bmatrix} y^{(1)} & y^{(2)} & \cdots & y^{(i)} & \cdots & y^{(m)} \end{bmatrix}^T$$
$m \times 1$
参数
$$\mathbf{w} = \begin{bmatrix} w_{1} & w_{2} & \cdots & w_{j} & \cdots & w_{n} \end{bmatrix}^T$$
$n \times 1$
输出
$$\begin{aligned} f_{\mathbf{w},b}(\mathbf{x}^{(i)}) &= g(\mathbf{w}^T \mathbf{x}^{(i)} + b) \\ g(z) &= \frac{1}{1 + e^{-z}} \end{aligned}$$
标量
输出(矩阵形式)
$$f_{\mathbf{w},b}(\mathbf{X}) = g(\mathbf{X}\mathbf{w} + b)$$
$m \times 1$
预测
$$\hat{y}^{(i)} = \begin{cases} 1 & \text{if } f_{\mathbf{w},b}(\mathbf{x}^{(i)}) \ge 0.5 \\ 0 & \text{if } f_{\mathbf{w},b}(\mathbf{x}^{(i)}) < 0.5 \end{cases}$$
标量
损失函数
$$\begin{aligned} \mathrm{cost}^{(i)} &= \begin{cases} -\log\left( f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) & \text{if } y^{(i)} = 1 \\ -\log\left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) & \text{if } y^{(i)} = 0 \end{cases} \\ &= -y^{(i)} \log\left( f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) - \left( 1 - y^{(i)} \right) \log\left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) \end{aligned}$$
标量
代价函数
$$\begin{aligned} J(\mathbf{w},b) &= \frac{1}{m} \sum\limits_{i=1}^{m} \mathrm{cost}^{(i)} + \frac{\lambda}{2m} \sum\limits_{j=1}^{n} w_{j}^2 \\ &= \frac{1}{m} \left( -\mathbf{y}^T \log\left( f_{\mathbf{w},b}(\mathbf{X}) \right) - (\mathbf{1} - \mathbf{y})^T \log\left( \mathbf{1} - f_{\mathbf{w},b}(\mathbf{X}) \right) \right) + \frac{\lambda}{2m} \mathbf{w}^T \mathbf{w} \end{aligned}$$
其中 $\mathbf{1}$ 为 $m \times 1$ 的全 1 向量,$\log$ 对向量逐元素作用。
标量
梯度下降
$$\begin{aligned} w_j &:= w_j - \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \\ b &:= b - \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\ \frac{\partial J(\mathbf{w},b)}{\partial w_j} &= \frac{1}{m} \sum\limits_{i=1}^{m} \left( f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)} \right) x_{j}^{(i)} + \frac{\lambda}{m} w_j \\ \frac{\partial J(\mathbf{w},b)}{\partial b} &= \frac{1}{m} \sum\limits_{i=1}^{m} \left( f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)} \right) \end{aligned}$$
标量
梯度下降(矩阵形式)
$$\begin{aligned} \mathbf{w} &:= \mathbf{w} - \alpha \frac{\partial J(\mathbf{w},b)}{\partial \mathbf{w}} \\ \frac{\partial J(\mathbf{w},b)}{\partial \mathbf{w}} &= \frac{1}{m} \mathbf{X}^T \left( f_{\mathbf{w},b}(\mathbf{X}) - \mathbf{y} \right) + \frac{\lambda}{m} \mathbf{w} \end{aligned}$$