Sorensen–Dice coefficient: Difference between revisions

m
Line 203:
 
=={{header|Julia}}==
Julia has a multiset module, but that module appears to implement intersection as a plain set intersection, so a counter package is used instead: it keeps the minimum count of each token when computing the size of the intersection. Note that the code handles multibyte characters, so a 2-character sequence containing a 2-byte character occupies 3 bytes. This changes the fifth choice in the last example, because "Erdős" contains the multibyte character 'ő'.
<syntaxhighlight lang="julia">using DataStructures: counterMultisets
 
""" convert a phrase into a count of bigram tokens of its words """
function tokenizetext(txt)
tokens = counter(Multiset{String}()
words = split(lowercase(txt), r"\s+")
for w in words
if length(w) < 3
push!(tokens[w], = tokens[w] + 1)
else
for i in 1:length(w)-1
if isvalid(w, i) && isvalid(w, i + 1) && push!(tokens, w[i:i+1])
tokens[w[i:i+1]] = tokens[w[i:i+1]] + 1
end
end
end
Line 225 ⟶ 223:
 
""" Sorenson-Dice similarity of multisets """
function sorenson_dicesorensondice(text1, text2)
    # Sørensen–Dice similarity of two texts: 2·|A ∩ B| / (|A| + |B|), where A and B
    # are the token collections that `tokenizetext` builds for `text1` and `text2`.
    #
    # NOTE(review): this function's name and the former `valueslength` call look like
    # old/new identifiers fused together by the diff rendering. The name is kept
    # unchanged so the existing call site keeps resolving; confirm the intended name
    # against the page's current revision.
    bc1, bc2 = tokenizetext(text1), tokenizetext(text2)

    # Size of a token collection counted with multiplicity, not distinct tokens.
    # Assumes `tokenizetext` returns a token => count container for which `values`
    # yields the counts and `∩` keeps the smaller count — TODO confirm.
    total(tokens) = sum(values(tokens))

    return 2 * total(bc1 ∩ bc2) / (total(bc1) + total(bc2))
end
 
Line 235 ⟶ 233:
for test in ["Primordial primes", "Sunkist-Giuliani formula",
"Sieve of Euripides", "Chowder numbers"]
taskvalues = sort!([(sorenson_dicesorensondice(test, t), t) for t in alltasks], rev = true)
println("\n$test:")
for (val, task) in taskvalues[begin:begin+4]
Line 241 ⟶ 239:
end
end
 
</syntaxhighlight>{{out}}
<pre>
4,102

edits