using Plots, LinearAlgebra, Random, StatsBase, SparseArrays, CSV, DataFrames, Tables
# Load the season's game records; `rowtable` converts the DataFrame into a
# Vector of NamedTuples (one per game) so we can `map` over rows below.
content = rowtable(CSV.read("ncaa-2023.csv", DataFrame, header=false))
5610-element Vector{@NamedTuple{Column1::String, Column2::String15, Column3::String, Column4::Float64, Column5::Float64, Column6::Int64, Column7::Int64, Column8::Int64, Column9::Int64, Column10::Float64, Column11::Float64}}: (Column1 = "Mississippi Valley St.Baylor11-7", Column2 = "11/7/22", Column3 = "Baylor -37.0, 95-58 (100%)", Column4 = 58.08219264, Column5 = 95.10164256, Column6 = 53, Column7 = 117, Column8 = 355, Column9 = 2, Column10 = 72.61017799, Column11 = 29.83936953) (Column1 = "BresciaMiddle Tennessee11-7", Column2 = "11/7/22", Column3 = "Middle Tennessee (100%)", Column4 = 51.57063963, Column5 = 92.52730733, Column6 = 52, Column7 = 79, Column8 = 0, Column9 = 100, Column10 = 70.6356741, Column11 = 0.55918372) (Column1 = "PacificStanford11-7", Column2 = "11/7/22", Column3 = "Stanford -19.1, 82-62 (96%)", Column4 = 62.4924586, Column5 = 81.59700436, Column6 = 78, Column7 = 88, Column8 = 254, Column9 = 37, Column10 = 70.3686906, Column11 = 35.83878057) (Column1 = "Valley ForgeJames Madison11-7", Column2 = "11/7/22", Column3 = "James Madison (100%)", Column4 = 56.12414098, Column5 = 95.1918094, Column6 = 38, Column7 = 123, Column8 = 0, Column9 = 157, Column10 = 72.10854632, Column11 = -2.130669557) (Column1 = "JWU (Charlotte)UNC Greensboro11-7", Column2 = "11/7/22", Column3 = "UNC Greensboro (100%)", Column4 = 50.26946273, Column5 = 89.43981154, Column6 = 60, Column7 = 93, Column8 = 0, Column9 = 108, Column10 = 69.30809531, Column11 = -0.968387195) (Column1 = "HarvardMorehouse11-7", Column2 = "11/7/22", Column3 = "Harvard (100%)", Column4 = 82.66080133, Column5 = 54.41964239, Column6 = 68, Column7 = 63, Column8 = 164, Column9 = 0, Column10 = 70.69162206, Column11 = -3.1999939) (Column1 = "DefianceOakland11-7", Column2 = "11/7/22", Column3 = "Oakland (100%)", Column4 = 56.31633105, Column5 = 90.91803277, Column6 = 27, Column7 = 92, Column8 = 0, Column9 = 213, Column10 = 71.1807591, Column11 = -6.139759419) (Column1 = "Coppin St.Charlotte11-7", Column2 = 
"11/7/22", Column3 = "Charlotte -10.9, 83-72 (84%)", Column4 = 71.73747765, Column5 = 82.68669043, Column6 = 59, Column7 = 82, Column8 = 304, Column9 = 180, Column10 = 73.97837139, Column11 = 33.15820053) (Column1 = "La SalleVillanova11-7", Column2 = "11/7/22", Column3 = "Villanova -15.2, 78-63 (92%)", Column4 = 62.90763241, Column5 = 78.1192195, Column6 = 68, Column7 = 81, Column8 = 156, Column9 = 17, Column10 = 69.98142267, Column11 = 44.63969223) (Column1 = "Fort WayneMichigan11-7", Column2 = "11/7/22", Column3 = "Michigan -13.0, 81-68 (88%)", Column4 = 67.65212191, Column5 = 80.61596127, Column6 = 56, Column7 = 75, Column8 = 147, Column9 = 38, Column10 = 70.80633829, Column11 = 47.29321085) (Column1 = "FisherUMass Lowell11-7", Column2 = "11/7/22", Column3 = "UMass Lowell (100%)", Column4 = 55.4776031, Column5 = 88.7214148, Column6 = 43, Column7 = 108, Column8 = 0, Column9 = 228, Column10 = 71.04435062, Column11 = -6.841735024) (Column1 = "New OrleansButler11-7", Column2 = "11/7/22", Column3 = "Butler -17.5, 84-66 (94%)", Column4 = 66.37333751, Column5 = 83.91084458, Column6 = 53, Column7 = 89, Column8 = 321, Column9 = 110, Column10 = 74.00732387, Column11 = 28.93818721) (Column1 = "Lebanon ValleyBucknell11-7", Column2 = "11/7/22", Column3 = "Bucknell (100%)", Column4 = 58.35162454, Column5 = 90.7786648, Column6 = 52, Column7 = 113, Column8 = 0, Column9 = 267, Column10 = 71.31113414, Column11 = -8.568614778) ⋮ (Column1 = "EvansvilleIllinois St.2-26", Column2 = "2/26/23", Column3 = "Illinois St. 
-9.1, 73-64 (82%)", Column4 = 64.11506566404985, Column5 = 73.22360392063257, Column6 = 53, Column7 = 72, Column8 = 348, Column9 = 287, Column10 = 67.70143746278178, Column11 = 21.597462385507335) (Column1 = "WisconsinMichigan2-26", Column2 = "2/26/23", Column3 = "Michigan -4.9, 66-61 (71%)", Column4 = 61.457675924637904, Column5 = 66.35741568390314, Column6 = 79, Column7 = 87, Column8 = 68, Column9 = 47, Column10 = 63.45026112919709, Column11 = 60.362252415967916) (Column1 = "Wichita St.Tulane2-26", Column2 = "2/26/23", Column3 = "Tulane -5.8, 78-72 (71%)", Column4 = 72.4234967579466, Column5 = 78.19234548772916, Column6 = 83, Column7 = 76, Column8 = 95, Column9 = 81, Column10 = 71.85067938277956, Column11 = 61.39030324895765) (Column1 = "UCLAColorado2-26", Column2 = "2/26/23", Column3 = "UCLA -6.7, 70-63 (76%)", Column4 = 70.12275807975539, Column5 = 63.3954204019498, Column6 = 60, Column7 = 56, Column8 = 3, Column9 = 80, Column10 = 69.07916008700157, Column11 = 67.59068192427755) (Column1 = "DrakeBradley2-26", Column2 = "2/26/23", Column3 = "Bradley -2.2, 67-65 (59%)", Column4 = 64.91285968915885, Column5 = 67.07176003739494, Column6 = 61, Column7 = 73, Column8 = 54, Column9 = 72, Column10 = 65.04641652008074, Column11 = 68.3591369545626) (Column1 = "UCFTulsa2-26", Column2 = "2/26/23", Column3 = "UCF -9.7, 74-64 (83%)", Column4 = 74.0566060873923, Column5 = 64.34699603415602, Column6 = 68, Column7 = 49, Column8 = 66, Column9 = 287, Column10 = 65.74941442865234, Column11 = 38.310522657010495) (Column1 = "ValparaisoMurray St.2-26", Column2 = "2/26/23", Column3 = "Murray St. -6.4, 76-70 (73%)", Column4 = 70.0319843529249, Column5 = 76.4250223649765, Column6 = 76, Column7 = 77, Column8 = 273, Column9 = 223, Column10 = 67.29922491001606, Column11 = 36.508178442747095) (Column1 = "RutgersPenn St.2-26", Column2 = "2/26/23", Column3 = "Penn St. 
-1.7, 67-65 (57%)", Column4 = 65.49269945316647, Column5 = 67.16562140473204, Column6 = 59, Column7 = 56, Column8 = 30, Column9 = 51, Column10 = 63.96814012860589, Column11 = 73.0180743188931) (Column1 = "WashingtonStanford2-26", Column2 = "2/26/23", Column3 = "Stanford -4.6, 73-69 (68%)", Column4 = 68.8418690708591, Column5 = 73.40993207092959, Column6 = 69, Column7 = 81, Column8 = 108, Column9 = 95, Column10 = 67.54958884796666, Column11 = 58.28396042896583) (Column1 = "Cal BaptistStephen F. Austin2-26", Column2 = "2/26/23", Column3 = "Stephen F. Austin -4.8, 70-65 (69%)", Column4 = 65.32491295566658, Column5 = 70.08624698312488, Column6 = 58, Column7 = 80, Column8 = 139, Column9 = 124, Column10 = 65.70546229716118, Column11 = 50.783821804477824) (Column1 = "WashingtonStanford2-26", Column2 = "2/26/23", Column3 = "Stanford -4.6, 73-69 (68%)", Column4 = 68.8418690708591, Column5 = 73.40993207092959, Column6 = 69, Column7 = 81, Column8 = 108, Column9 = 95, Column10 = 67.54958884796666, Column11 = 58.28396042896583) (Column1 = "Cal BaptistStephen F. Austin2-26", Column2 = "2/26/23", Column3 = "Stephen F. Austin -4.8, 70-65 (69%)", Column4 = 65.32491295566658, Column5 = 70.08624698312488, Column6 = 58, Column7 = 80, Column8 = 139, Column9 = 124, Column10 = 65.70546229716118, Column11 = 50.783821804477824)
"""Games are stored in this really weird way
gameid = Coppin St.Charlotte11-7
predict = Charlotte -10.9, 83-72 (84%)
So we are going to match `predict` to recover the favored team's name
(`team1`), strip the trailing date digits from `gameid`, and split what
remains on `team1` to recover the opponent (`team2`).

Returns `(team1, team2)`.
Throws an `ArgumentError` when either string does not match the expected
format instead of crashing with a `MethodError` on `nothing`.
"""
function parse_game(gameid, predict)
    # team name = leading run of word chars/dots/apostrophes/spaces,
    # followed by the numeric spread / score / probability text
    m = match(r"([\w.' ]+) ([ -1234567890,.\(\)%]+)", predict)
    m === nothing && throw(ArgumentError("unparseable prediction: $predict"))
    team1 = m.captures[1]
    # remove the digits (the date, e.g. "11-7") at the end of the game id
    m2 = match(r"^([a-zA-Z\(\)&'-. ]+)[\d-]+$", gameid)
    m2 === nothing && throw(ArgumentError("unparseable game id: $gameid"))
    stem = m2.captures[1]
    # whatever is left after removing team1 from the id is the opponent
    parts = filter(!isempty, split(stem, team1))
    isempty(parts) && throw(ArgumentError("could not find opponent in: $gameid"))
    team2 = parts[1]
    # it's possible team2 still has bad symbols...
    # but maybe
    return team1, team2
end
# One record per game: the team named in the prediction string (Column3)
# becomes `team`, with its score in Column7; the other side's score is Column6.
gamedata = map(content) do row
    t1, t2 = parse_game(row.Column1, row.Column3)
    return (team=t1, opponent=t2, date=row.Column2,
            teamscore=row.Column7, opponentscore=row.Column6)
end |> DataFrame
Row | team | opponent | date | teamscore | opponentscore |
---|---|---|---|---|---|
SubStrin… | SubStrin… | String15 | Int64 | Int64 | |
1 | Baylor | Mississippi Valley St. | 11/7/22 | 117 | 53 |
2 | Middle Tennessee | Brescia | 11/7/22 | 79 | 52 |
3 | Stanford | Pacific | 11/7/22 | 88 | 78 |
4 | James Madison | Valley Forge | 11/7/22 | 123 | 38 |
5 | UNC Greensboro | JWU (Charlotte) | 11/7/22 | 93 | 60 |
6 | Harvard | Morehouse | 11/7/22 | 63 | 68 |
7 | Oakland | Defiance | 11/7/22 | 92 | 27 |
8 | Charlotte | Coppin St. | 11/7/22 | 82 | 59 |
9 | Villanova | La Salle | 11/7/22 | 81 | 68 |
10 | Michigan | Fort Wayne | 11/7/22 | 75 | 56 |
11 | UMass Lowell | Fisher | 11/7/22 | 108 | 43 |
12 | Butler | New Orleans | 11/7/22 | 89 | 53 |
13 | Bucknell | Lebanon Valley | 11/7/22 | 113 | 52 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
5599 | Illinois St. | Evansville | 2/26/23 | 72 | 53 |
5600 | Michigan | Wisconsin | 2/26/23 | 87 | 79 |
5601 | Tulane | Wichita St. | 2/26/23 | 76 | 83 |
5602 | UCLA | Colorado | 2/26/23 | 56 | 60 |
5603 | Bradley | Drake | 2/26/23 | 73 | 61 |
5604 | UCF | Tulsa | 2/26/23 | 49 | 68 |
5605 | Murray St. | Valparaiso | 2/26/23 | 77 | 76 |
5606 | Penn St. | Rutgers | 2/26/23 | 56 | 59 |
5607 | Stanford | Washington | 2/26/23 | 81 | 69 |
5608 | Stephen F. Austin | Cal Baptist | 2/26/23 | 80 | 58 |
5609 | Stanford | Washington | 2/26/23 | 81 | 69 |
5610 | Stephen F. Austin | Cal Baptist | 2/26/23 | 80 | 58 |
We will rank sports teams using a variant of least squares
## Convert teams into numbers and also build our game table
m = size(content, 1)
teams_dict = Dict{String,Int}()   # team name -> integer id
teams = Vector{String}()          # id -> team name (inverse map)
data = Array{Int64}(undef, m, 4)  # columns: team id, opponent id, team score, opponent score
# Hoist the columns once instead of re-extracting them on every iteration.
teamcol = gamedata[!, :team]
oppcol = gamedata[!, :opponent]
tscol = gamedata[!, :teamscore]
oscol = gamedata[!, :opponentscore]
for i in 1:m
    # get! assigns the next integer id the first time a name is seen
    data[i, 1] = get!(teams_dict, teamcol[i]) do
        push!(teams, teamcol[i])
        length(teams)
    end
    data[i, 2] = get!(teams_dict, oppcol[i]) do
        push!(teams, oppcol[i])
        length(teams)
    end
    data[i, 3] = tscol[i]
    data[i, 4] = oscol[i]
end
index = length(teams) + 1  # next unused id = number of distinct teams + 1
index
791
# stochastic gradient descent works on a collection
# of functions, each of which provides their own gradient.
# Each entry fs[i] is called as fs[i](x) and returns the tuple
# (value, gradient, Hessian) for one term of the finite sum; `x` is the
# current iterate, updated in place by the step routines below.
mutable struct SGD
    fs::Vector{Function} # we will randomly sample data
    x::Vector # current point
end
# Full gradient of the finite-sum objective: the average of the
# per-term gradients at the current point M.x.
function gradient(M::SGD) # return the Gradient
    N = length(M.fs)
    total = similar(M.x)
    fill!(total, zero(eltype(M.x)))
    for term in 1:N
        total .+= (1.0/N)*gradient(M, term) # per-term gradient (2nd output)
    end
    return total
end
# Full Hessian: the average of the per-term Hessians at M.x.
function Hessian(M::SGD)
    return mean([Hessian(M, k) for k in 1:length(M.fs)])
end
# Gradient of the i-th term: the stored functions return
# (value, gradient, Hessian), so take the second output.
gradient(M::SGD, i::Int) = M.fs[i](M.x)[2]
# Hessian of the i-th term: the stored functions return
# (value, gradient, Hessian), so take the third output.
Hessian(M::SGD, i::Int) = M.fs[i](M.x)[3]
# Objective value: the average of the per-term values (first output)
# at the current point M.x.
function fval(M::SGD)
    N = length(M.fs)
    total = 0.0
    for k in eachindex(M.fs)
        total += (1.0/N)*M.fs[k](M.x)[1]
    end
    return total
end
## Setup least squares
# One least-squares term: the rating difference x[i] - x[j] should match
# the observed score margin s.  Returns (value, gradient, Hessian); the
# Hessian is the constant sparse matrix with the +1/-1 pattern on (i, j).
function sports_loss(x, i, j, s)
    r = x[i] - x[j] - s
    f = 0.5*r^2
    g = zeros(length(x))
    g[i] = r
    g[j] = -r
    n = length(x)
    H = sparse([i, i, j, j], [i, j, i, j], [1.0, -1.0, -1.0, 1.0], n, n)
    return f, g, H
end
## This set up is for the problem
# Build one loss closure per game; closure k captures the two team ids
# and the observed score margin for game k.  Start from the zero rating
# vector (one entry per team).
opt = SGD(
    [x -> sports_loss(x, data[k, 1], data[k, 2], data[k, 3] - data[k, 4])
     for k in 1:size(data, 1)],
    zeros(length(teams_dict))
)
##
g = gradient(opt) # full gradient at the initial all-zero rating vector
#m.fs[1](m.x)
790-element Vector{Float64}: -0.05525846702317291 0.06809269162210337 -0.026024955436720138 0.026203208556149733 -0.02584670231729055 0.010695187165775406 -0.040641711229946524 0.015151515151515152 -0.012299465240641709 0.01158645276292335 0.00802139037433155 0.0033868092691622105 0.007130124777183602 ⋮ 0.0042780748663101605 -0.00196078431372549 0.0010695187165775401 0.00089126559714795 0.00409982174688057 0.006060606060606061 0.00196078431372549 0.008556149732620321 0.00035650623885918 -0.00071301247771836 0.01301247771836007 0.00017825311942959
## Validate the gradient
# Build the explicit incidence-style matrix B (one row per game, +1 in the
# team's column and -1 in the opponent's) and the margin vector p, then
# compare the closed-form least-squares gradient against gradient(opt).
m = size(data,1)
B = zeros(m, length(teams))
p = zeros(m)
for game in 1:m
    teamid = data[game, 1]
    oppid = data[game, 2]
    p[game] = data[game, 3] - data[game, 4]
    B[game, teamid] = 1
    B[game, oppid] = -1
end
gtest = 1/size(B,1)*B'*(-p)
norm(g-gtest)
9.711277220295703e-17
## Validate the Hessian: compare the averaged sparse Hessians against the
## explicit normal-equations matrix B'B/m built above
norm(B'*B/size(B,1) - Hessian(opt)) # should be 0.0
0.0
"""
    gradient_descent_step!(m::SGD, alpha=0.01)

Take one full-gradient descent step `m.x .-= alpha*g` and return the
gradient that was used.  The previous version computed the gradient but
never updated `m.x`, so despite the `!` it had no effect on the iterate;
`alpha` defaults to the step size used in the manual loop below.
"""
function gradient_descent_step!(m::SGD, alpha=0.01)
    g = gradient(m)
    m.x .-= alpha*g
    return g
end
gradient_descent_step! (generic function with 1 method)
# One stochastic gradient step: pick a single random term and move
# against its gradient with step size alpha.
function sgd_step!(m::SGD, alpha)
    term = rand(m.fs)
    m.x .-= alpha*term(m.x)[2] # the 2nd output is the gradient
end
# Mini-batch SGD step: average the gradients of S randomly sampled terms
# (with replacement) and take one step of size alpha along the average.
function sgd_step!(m::SGD, S::Integer, alpha)
    term = rand(m.fs)
    step = alpha*term(m.x)[2]/S # the 2nd output is the gradient
    for _ in 2:S
        term = rand(m.fs)
        step .+= alpha*term(m.x)[2]/S
    end
    m.x .-= step
end
# Run SGD with alpha = 0.001, logging the full gradient norm and
# objective value every 100 steps.
norms = Float64[]
fvals = Float64[]
for epoch in 1:100
    for _ in 1:100 # show gradient every few steps
        sgd_step!(opt, 0.001) # How to set alpha?
    end
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
plot(
    plot(norms, label="", ylabel="norm(g)"),
    plot(fvals, label="", ylabel="fval")
)
# Same experiment with a 10x larger step size (alpha = 0.01),
# restarting from the zero rating vector.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for epoch in 1:100
    for _ in 1:100 # show gradient every few steps
        sgd_step!(opt, 0.01) # How to set alpha?
    end
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
plot(
    plot(norms, label="", ylabel="norm(g)"),
    plot(fvals, label="", ylabel="fval")
)
# Mini-batch variant: 100 samples per step with alpha = 0.1,
# logging after every step, again from the zero vector.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for epoch in 1:100
    sgd_step!(opt, 100, 0.1) # How to set alpha?
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
plot(
    plot(norms, label="", ylabel="norm(g)"),
    plot(fvals, label="", ylabel="fval")
)
# Baseline: deterministic full-gradient descent (alpha = 0.01) for
# comparison with the stochastic runs above.
norms = Float64[]
opt.x = zeros(length(opt.x))
fvals = Float64[]
for step in 1:100
    g = gradient(opt)
    opt.x .-= 0.01*g
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
plot(
    plot(norms, label="", ylabel="norm(g)"),
    plot(fvals, label="", ylabel="fval")
)
# Subsampled Newton step: estimate the Hessian from T random terms and
# the gradient from S random terms (both with replacement), then solve
# the damped Newton system for the search direction.
function sn_step!(m::SGD, S::Integer, T::Integer)
    Hhat = mean(i->m.fs[i](m.x)[3], rand(1:length(m.fs), T))
    ghat = mean(i->m.fs[i](m.x)[2], rand(1:length(m.fs), S))
    # heuristic scaling to make the model
    # defined in all coordinates...
    gamma = T/(2*length(m.fs)*log(length(m.fs)))
    gamma = max(1/4 - gamma, 0)
    direction = (Hhat + gamma*I)\(-ghat)
    m.x .+= direction
end
##
# Subsampled Newton run: 100 iterations with S = T = 100 samples each,
# starting from the zero rating vector.
norms = Float64[]
opt.x = zeros(length(opt.x))
fvals = Float64[]
for step in 1:100
    sn_step!(opt, 100, 100)
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
plot(
    plot(norms, label="", ylabel="norm(g)"),
    plot(fvals, label="", ylabel="fval")
)