Most frequent k chars distance: Difference between revisions

Content added Content deleted
(Add python implementation)
m (re-order)
Line 311: Line 311:


}</lang>
}</lang>

=={{header|Tcl}}==
{{works with|Tcl|8.6}}
<lang tcl>package require Tcl 8.6

proc MostFreqKHashing {inputString k} {
foreach ch [split $inputString ""] {dict incr count $ch}
join [lrange [lsort -stride 2 -index 1 -integer -decreasing $count] 0 [expr {$k*2-1}]] ""
}
proc MostFreqKSimilarity {hashStr1 hashStr2} {
while {$hashStr2 ne ""} {
regexp {^(.)(\d+)(.*)$} $hashStr2 -> ch n hashStr2
set lookup($ch) $n
}
set similarity 0
while {$hashStr1 ne ""} {
regexp {^(.)(\d+)(.*)$} $hashStr1 -> ch n hashStr1
if {[info exist lookup($ch)]} {
incr similarity $n
incr similarity $lookup($ch)
}
}
return $similarity
}
proc MostFreqKSDF {inputStr1 inputStr2 k limit} {
set h1 [MostFreqKHashing $inputStr1 $k]
set h2 [MostFreqKHashing $inputStr2 $k]
expr {$limit - [MostFreqKSimilarity $h1 $h2]}
}</lang>
Demonstrating:
<lang tcl>set str1 "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV"
set str2 "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG"
puts [MostFreqKHashing $str1 2]
puts [MostFreqKHashing $str2 2]
puts [MostFreqKSDF $str1 $str2 2 100]</lang>
{{out}}
<pre>
L9T8
F9L8
83
</pre>
;A more efficient metric calculator
This version is appreciably more efficient because it does not compute the intermediate string representation “hash”, instead working directly on the intermediate dictionaries and lists:
<lang tcl>proc MostFreqKSDF {inputStr1 inputStr2 k limit} {
set c1 [set c2 {}]
foreach ch [split $inputStr1 ""] {dict incr c1 $ch}
foreach ch [split $inputStr2 ""] {dict incr c2 $ch}
set c2 [lrange [lsort -stride 2 -index 1 -integer -decreasing $c2[set c2 {}]] 0 [expr {$k*2-1}]]
set s 0
foreach {ch n} [lrange [lsort -stride 2 -index 1 -integer -decreasing $c1[set c1 {}]] 0 [expr {$k*2-1}]] {
if {[dict exists $c2 $ch]} {
incr s [expr {$n + [dict get $c2 $ch]}]
}
}
return [expr {$limit - $s}]
}</lang>
It computes the identical value on the identical inputs.


=={{header|Python}}==
=={{header|Python}}==
Line 450: Line 393:
83
83
</pre>
</pre>

=={{header|Tcl}}==
{{works with|Tcl|8.6}}
<lang tcl>package require Tcl 8.6

proc MostFreqKHashing {inputString k} {
foreach ch [split $inputString ""] {dict incr count $ch}
join [lrange [lsort -stride 2 -index 1 -integer -decreasing $count] 0 [expr {$k*2-1}]] ""
}
proc MostFreqKSimilarity {hashStr1 hashStr2} {
while {$hashStr2 ne ""} {
regexp {^(.)(\d+)(.*)$} $hashStr2 -> ch n hashStr2
set lookup($ch) $n
}
set similarity 0
while {$hashStr1 ne ""} {
regexp {^(.)(\d+)(.*)$} $hashStr1 -> ch n hashStr1
if {[info exist lookup($ch)]} {
incr similarity $n
incr similarity $lookup($ch)
}
}
return $similarity
}
proc MostFreqKSDF {inputStr1 inputStr2 k limit} {
set h1 [MostFreqKHashing $inputStr1 $k]
set h2 [MostFreqKHashing $inputStr2 $k]
expr {$limit - [MostFreqKSimilarity $h1 $h2]}
}</lang>
Demonstrating:
<lang tcl>set str1 "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV"
set str2 "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG"
puts [MostFreqKHashing $str1 2]
puts [MostFreqKHashing $str2 2]
puts [MostFreqKSDF $str1 $str2 2 100]</lang>
{{out}}
<pre>
L9T8
F9L8
83
</pre>
;A more efficient metric calculator
This version is appreciably more efficient because it does not compute the intermediate string representation “hash”, instead working directly on the intermediate dictionaries and lists:
<lang tcl>proc MostFreqKSDF {inputStr1 inputStr2 k limit} {
set c1 [set c2 {}]
foreach ch [split $inputStr1 ""] {dict incr c1 $ch}
foreach ch [split $inputStr2 ""] {dict incr c2 $ch}
set c2 [lrange [lsort -stride 2 -index 1 -integer -decreasing $c2[set c2 {}]] 0 [expr {$k*2-1}]]
set s 0
foreach {ch n} [lrange [lsort -stride 2 -index 1 -integer -decreasing $c1[set c1 {}]] 0 [expr {$k*2-1}]] {
if {[dict exists $c2 $ch]} {
incr s [expr {$n + [dict get $c2 $ch]}]
}
}
return [expr {$limit - $s}]
}</lang>
It computes the identical value on the identical inputs.


=References=
=References=