\[\begin{align} f(x)=\begin{cases} 1 & \text{if }\sum_iw_ix_i+b\ge 0\\ 0 & \text{otherwise} \end{cases} \end{align} \]
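\(\text{A minimal sketch of this decision rule in PyTorch (the function name and the explicit bias argument are illustrative):}\)

import torch

def f(x, w, b):
    # Perceptron decision rule: 1 if w.x + b >= 0, and 0 otherwise
    return 1 if torch.dot(w, x) + b >= 0 else 0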
\(\large \textbf{Perceptron Algorithm:}\)
\(\text{Code}\):
def train_perceptron(x, y, nb_epochs_max):
    w = torch.zeros(x.size(1))
    for e in range(nb_epochs_max):
        nb_changes = 0
        for i in range(x.size(0)):  # iterate over the samples of the batch
            # a sample is misclassified (or on the boundary) when y_i (w . x_i) <= 0
            if x[i].dot(w) * y[i] <= 0:
                w = w + y[i] * x[i]
                nb_changes = nb_changes + 1
        if nb_changes == 0:
            break
    return w
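\(\text{A usage sketch on toy linearly separable data with labels in }\{-1,1\}\text{ (the cluster means and sample counts below are made up for illustration):}\)

import torch

# Two clusters centered at (2, 2) and (-2, -2), labels +1 / -1
x = torch.cat((torch.randn(50, 2) + 2.0, torch.randn(50, 2) - 2.0), 0)
y = torch.cat((torch.ones(50), -torch.ones(50)), 0)

w = train_perceptron(x, y, nb_epochs_max = 100)

# Predict with the learned separator (no bias term in this version)
pred = torch.sign(x @ w)
print('training accuracy', (pred == y).float().mean().item())  # should be close to 1.0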
\(\text{We can get convergence under two assumptions:}\)
\(\text{1. The samples are bounded: there exists }R\text{ such that }\forall n,\ ||x_n||\le R.\)
\(\text{2. The classes are linearly separable with margin }\gamma>0\text{: there exists }w^*\text{ with }||w^*||=1\text{ such that }\forall n,\ y_n(w^*\cdot x_n)\ge\gamma.\)
\(\large\text{The larger the margin, the more quickly the algorithm classifies all the samples correctly.}\)
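\(\text{This is made precise by the perceptron convergence theorem (Novikoff): under the two assumptions above, the number of weight updates before all samples are classified correctly is bounded by}\)
\[\begin{align} \text{number of updates}\le\frac{R^2}{\gamma^2} \end{align} \]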
\(\textbf{SVMs }\text{achieve this by minimizing:}\)
\[\begin{align} \mathcal{L}(w,b) = \lambda||w||^2+\frac{1}{N}\sum_n\max(0,1-y_n(w\cdot x_n+b)) \end{align} \]\(\textbf{which is convex and has a global optimum.}\)
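\(\text{A minimal sketch of minimizing this objective with gradient descent and autograd in PyTorch (the values of }\lambda\text{, the learning rate, and the number of steps are arbitrary choices for illustration):}\)

import torch

def train_svm(x, y, lam = 1e-2, lr = 1e-1, nb_steps = 1000):
    w = torch.zeros(x.size(1), requires_grad = True)
    b = torch.zeros(1, requires_grad = True)
    for k in range(nb_steps):
        # L(w, b) = lambda ||w||^2 + (1/N) sum_n max(0, 1 - y_n (w . x_n + b))
        loss = lam * w.pow(2).sum() + (1 - y * (x @ w + b)).clamp(min = 0).mean()
        loss.backward()
        with torch.no_grad():
            w -= lr * w.grad
            b -= lr * b.grad
            w.grad.zero_()
            b.grad.zero_()
    return w.detach(), b.detach()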
\(\large\text{Hinge Loss:}\)
\[\begin{align} \max(0,1-a) \end{align} \]\(\text{where here }a=y_n(w\cdot x_n+b)\text{. Consider the following class populations:}\)
\[\begin{align} \mu_{X|Y=y}(x) = \frac{1}{\sqrt{(2\pi)^D|\Sigma|}}\exp{[-\frac{1}{2}(x-m_y)\Sigma^{-1}(x-m_y)^T]} \end{align} \]\(\text{for }y\in\{0,1\}\text{ and }x\in\mathbb{R}^D\text{, where }m_y\text{ is the mean of class }y\text{ and }\Sigma\text{ the covariance shared by both classes.}\)
\(\text{By Bayes' rule, we have:}\)
\[\begin{align} P(Y=1|X=x)&=\frac{\mu_{X|Y=1}(x)P(Y=1)}{\mu_X(x)}\\ &=\frac{\mu_{X|Y=1}(x)P(Y=1)}{\mu_{X|Y=0}(x)P(Y=0)+\mu_{X|Y=1}(x)P(Y=1)}\\ &=\frac{1}{1+\frac{\mu_{X|Y=0}(x)P(Y=0)}{\mu_{X|Y=1}(x)P(Y=1)}}\\ &=\sigma(\log{\frac{\mu_{X|Y=1}(x)}{\mu_{X|Y=0}(x)}}+\log{\frac{P(Y=1)}{P(Y=0)}}) \end{align} \]\(\text{where}\)
\[\begin{align} \sigma(x) = \frac{1}{1+e^{-x}} \end{align} \]\(\text{is the sigmoid function. So with our Gaussians }\mu_{X|Y=y}\text{ sharing the same }\Sigma\text{, we get:}\)
\[\begin{align} P(Y=1|X=x)&=\sigma(\log{\frac{\mu_{X|Y=1}(x)}{\mu_{X|Y=0}(x)}}+\log{\frac{P(Y=1)}{P(Y=0)}})\\ &=\sigma(\log{\mu_{X|Y=1}(x)}-\log{\mu_{X|Y=0}(x)}+Z)\\ &=\sigma({-\frac{1}{2}(x-m_1)\Sigma^{-1}(x-m_1)^T}+{\frac{1}{2}(x-m_0)\Sigma^{-1}(x-m_0)^T}+Z)\\ &=\sigma[(m_1-m_0)\Sigma^{-1}x^T+\frac{1}{2}(m_0\Sigma^{-1}m_0^T-m_1\Sigma^{-1}m_1^T)+Z]\\ &=\sigma(w\cdot x+b) \end{align} \]\(\text{where }Z=\log{\frac{P(Y=1)}{P(Y=0)}}\text{: the posterior is a logistic sigmoid of a linear function of }x.\)
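\(\text{A sketch checking this numerically: compute }w\text{ and }b\text{ from the closed form above and compare }\sigma(w\cdot x+b)\text{ with the posterior obtained directly from the two densities (the dimension, means, covariance, and equal priors below are made up):}\)

import torch

torch.manual_seed(0)
D = 3
m0, m1 = torch.randn(D), torch.randn(D)
A = torch.randn(D, D)
Sigma = A @ A.t() + torch.eye(D)   # a valid shared covariance matrix
Sigma_inv = torch.inverse(Sigma)

# Closed-form linear parameters (equal priors, so Z = 0)
w = Sigma_inv @ (m1 - m0)
b = 0.5 * (m0 @ Sigma_inv @ m0 - m1 @ Sigma_inv @ m1)

def density(x, m):
    d = x - m
    return torch.exp(-0.5 * d @ Sigma_inv @ d) / torch.sqrt((2 * torch.pi)**D * torch.det(Sigma))

x = torch.randn(D)
posterior_direct = density(x, m1) / (density(x, m0) + density(x, m1))
posterior_linear = torch.sigmoid(w @ x + b)
print(posterior_direct.item(), posterior_linear.item())  # the two values should match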
\(\large\textbf{Universal Approximation Theorem}\)
\(\text{A network with a single hidden layer can approximate any continuous function on a compact domain to any desired accuracy, provided the hidden layer is large enough. A better approximation requires a larger hidden layer, which means that we can make the }\textbf{training error }\text{as low as we want. It states nothing about the }\textbf{test error}.\)
\(\text{Consider the logistic regression loss, i.e. the negative log-likelihood of the model }P(Y=y\,|\,x)=\sigma[y(w\cdot x+b)]\text{ with }y\in\{-1,1\}\text{:}\)
\[{L}(w,b) = -\sum_n\log{\sigma[y_n(w\cdot x_n+b)]} \]\(\text{Therefore:}\)
\[\begin{align} \frac{\partial L}{\partial w_d} = -\sum_n x_{n,d}y_n\sigma[-y_n(w\cdot x_n+b)] \end{align} \]\(\textbf{Note that:}\)
\[[\log{\sigma(x)}]' = \frac{\sigma'(x)}{\sigma(x)} = \frac{\sigma(x)\sigma(-x)}{\sigma(x)} = \sigma(-x) \]\(\textbf{Code}:\)
def gradient(x, y, w, b):
    # per-sample factor y_n * sigma(-y_n (w . x_n + b)); its sum gives the bias gradient
    u = y * ( - y * (x @ w + b)).sigmoid()
    # gradient w.r.t. w: each sample x_n weighted by u_n (broadcasting)
    v = x * u.view(-1, 1)
    return - v.sum(0), - u.sum(0)
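\(\text{A usage sketch: plain gradient descent with this function (the toy data, learning rate, and number of iterations are arbitrary choices):}\)

import torch

torch.manual_seed(0)
x = torch.cat((torch.randn(50, 2) + 1.0, torch.randn(50, 2) - 1.0), 0)
y = torch.cat((torch.ones(50), -torch.ones(50)), 0)

w, b = torch.zeros(2), torch.zeros(1)
lr = 1e-3
for k in range(1000):
    grad_w, grad_b = gradient(x, y, w, b)
    w = w - lr * grad_w
    b = b - lr * grad_b

# Training accuracy of the resulting linear classifier
print(((x @ w + b).sign() == y).float().mean().item())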
\(\text{Forward pass, for a multi-layer network with activation function }\sigma\text{:}\)
\[\begin{cases} s^{(l)} &= w^{(l)}x^{(l-1)}+b^{(l)}\\ x^{(l)} &= \sigma[s^{(l)}] \end{cases} \]\(\text{Backward:}\)
\[\begin{align} \frac{\partial L}{\partial s^{(l)}}&= \frac{\partial L}{\partial x^{(l)}}\odot\sigma'[s^{(l)}]\\ \frac{\partial L}{\partial x^{(l)}} &= [w^{(l+1)}]^T\frac{\partial L}{\partial s^{(l+1)}}\\ \frac{\partial L}{\partial w^{(l)}}&=\frac{\partial L}{\partial s^{(l)}}[x^{(l-1)}]^T\\ \frac{\partial L}{\partial b^{(l)}} &=\frac{\partial L}{\partial s^{(l)}} \end{align} \]
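\(\text{A sketch of these four rules for one hidden layer, checked against autograd (the layer sizes, the use of }\tanh\text{ as }\sigma\text{, and the squared-error loss are arbitrary choices):}\)

import torch

torch.manual_seed(0)
x0 = torch.randn(5, 1)                      # input x^(0), as a column vector
w1, b1 = torch.randn(3, 5, requires_grad = True), torch.randn(3, 1, requires_grad = True)
w2, b2 = torch.randn(2, 3, requires_grad = True), torch.randn(2, 1, requires_grad = True)
target = torch.randn(2, 1)

# Forward pass: s^(l) = w^(l) x^(l-1) + b^(l), x^(l) = sigma(s^(l)), with sigma = tanh
s1 = w1 @ x0 + b1
x1 = torch.tanh(s1)
s2 = w2 @ x1 + b2
x2 = torch.tanh(s2)
loss = (x2 - target).pow(2).sum()
loss.backward()

# Backward pass by hand
dl_dx2 = 2 * (x2 - target)
dl_ds2 = dl_dx2 * (1 - torch.tanh(s2)**2)   # dL/ds^(2) = dL/dx^(2) * sigma'(s^(2))
dl_dx1 = w2.t() @ dl_ds2                    # dL/dx^(1) = w^(2)^T dL/ds^(2)
dl_ds1 = dl_dx1 * (1 - torch.tanh(s1)**2)
dl_dw2 = dl_ds2 @ x1.t()                    # dL/dw^(2) = dL/ds^(2) x^(1)^T
dl_dw1 = dl_ds1 @ x0.t()

print(torch.allclose(dl_dw2, w2.grad), torch.allclose(dl_dw1, w1.grad))
print(torch.allclose(dl_ds2, b2.grad), torch.allclose(dl_ds1, b1.grad))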