WordGraph/main.jl

246 lines
5.8 KiB
Julia

using DataStructures
mutable struct node
word::String
neighbours::Vector{Int}
colored::Bool
end
function readInput(file="wordlist-german.txt";filter=(x->true))
words = Vector{String}()
for line in eachline(file)
if !filter(line)
continue
end
push!(words, line)
end
return sort(words)
end
function createGraph(words::Vector{String})
maxNeighbours = 0
graph = Vector{node}()
for w in words
_, myself = binarySearch(w, words)
neighbours = Vector{Int}()
for x in niceLevenshteinNeighbours(w)
exists, index = binarySearch(x, words)
if exists && index != myself
push!(neighbours, index)
end
end
maxNeighbours = max(maxNeighbours,length(neighbours))
if length(neighbours) == maxNeighbours
println(w)
end
n = node(w, neighbours, false)
push!(graph, n)
end
return graph
end
function countSubwords(words::Vector{String})
subwords = Vector{Vector{Int}}()
for i in 1:26
push!(subwords,Vector{Int}())
for j in 1:26
push!(subwords[i],0)
end
end
for w in words
word = collect(w)
for i in 1:length(w)-1
if lowercase(word[i]) == 'y' && lowercase(word[i+1]) == 'y'
println(w)
end
first = Int(lowercase(word[i])) - 96
second = Int(lowercase(word[i+1])) - 96
if first > 26
println(word[i])
end
if second > 26
println(word[i+1])
end
subwords[first][second] += 1
end
end
max = 0
maxI = 0
maxJ = 0
min = 1000000
minI = 0
minJ = 0
noOcc = 0
for i in eachindex(subwords)
for j in eachindex(subwords[i])
if subwords[i][j] > max
max = subwords[i][j]
maxI = i
maxJ = j
end
if subwords[i][j] < min && subwords[i][j] > 0
min = subwords[i][j]
minI = i
minJ = j
end
if subwords[i][j] == 0
noOcc += 1
println(i, " and ", j)
end
end
end
println("Maximum: ", max)
println("MaximumI: ", maxI)
println("MaximumJ: ", maxJ)
println("Minimum: ", min)
println("MinimumI: ", minI)
println("MinimumJ: ", minJ)
println(noOcc)
return subwords
end
function countNeighbours(w::String,words::Vector{String})
nr = 0
neighbours = []
for n in LevenshteinNeighbours(w)
exists, _ = binarySearch(n,words)
if exists && n != w
nr += 1
push!(neighbours,n)
end
end
println("Number of neighbours: ", nr)
println(neighbours)
end
function binarySearch(word::String, words::Vector{String})
left = 1
right = length(words)
while left < right
middle = floor(Int, (right + left) / 2)
if words[middle] < word
left = middle + 1
elseif words[middle] > word
right = middle - 1
else
return true, middle
end
end
return false, 0
end
function LevenshteinNeighbours(w::String)
neighbours = Vector{String}()
w = collect(w)
for i in eachindex(w)
for c in union('a':'z','A':'Z')
word = copy(w)
word[i] = c
push!(neighbours, String(word))
word = insert!(copy(w),i,c)
push!(neighbours, String(word))
end
word = deleteat!(copy(w), i)
push!(neighbours, String(word))
end
for c in union('a':'z','A':'Z')
word = copy(w)
push!(word, c)
push!(neighbours, String(word))
end
return neighbours
end
function niceLevenshteinNeighbours(w::String)
neighbours = Vector{String}()
w = collect(w)
for i in eachindex(w)
if i == 1
for c in union('A':'Z','a':'z')
word = copy(w)
word[i] = c
push!(neighbours, String(word))
if w[1] 'A':'Z'
continue
end
word = insert!(copy(w),i,c)
push!(neighbours, String(word))
end
else
for c in 'a':'z'
word = copy(w)
word[i] = c
push!(neighbours, String(word))
word = insert!(copy(w),i,c)
push!(neighbours, String(word))
end
end
word = deleteat!(copy(w), i)
push!(neighbours, String(word))
end
for c in 'a':'z'
word = copy(w)
push!(word, c)
push!(neighbours, String(word))
end
return neighbours
end
function BFS(g::Vector{node})
graph = deepcopy(g)
components = 0
biggestComponent = 0
q = Queue{node}()
for i in eachindex(graph)
if graph[i].colored == false
components += 1
currComponent = 1
enqueue!(q, graph[i])
graph[i].colored = true
while !isempty(q)
n = dequeue!(q)
for m in n.neighbours
if graph[m].colored == false
currComponent += 1
enqueue!(q, graph[m])
graph[m].colored = true
end
end
end
biggestComponent = max(biggestComponent, currComponent)
if biggestComponent == currComponent
println("Wort in der größten Komponente: ", graph[i].word)
end
end
end
println("Anzahl Zusammenhangskomponenten: ", components)
println("Größe der größten Zusammenhangskomponente: ", biggestComponent)
end