Most frequent k chars distance: Difference between revisions
Content added Content deleted
(Add python implementation) |
m (re-order) |
||
Line 311: | Line 311: | ||
}</lang> |
}</lang> |
||
=={{header|Tcl}}== |
|||
{{works with|Tcl|8.6}} |
|||
<lang tcl>package require Tcl 8.6 |
|||
proc MostFreqKHashing {inputString k} { |
|||
foreach ch [split $inputString ""] {dict incr count $ch} |
|||
join [lrange [lsort -stride 2 -index 1 -integer -decreasing $count] 0 [expr {$k*2-1}]] "" |
|||
} |
|||
proc MostFreqKSimilarity {hashStr1 hashStr2} { |
|||
while {$hashStr2 ne ""} { |
|||
regexp {^(.)(\d+)(.*)$} $hashStr2 -> ch n hashStr2 |
|||
set lookup($ch) $n |
|||
} |
|||
set similarity 0 |
|||
while {$hashStr1 ne ""} { |
|||
regexp {^(.)(\d+)(.*)$} $hashStr1 -> ch n hashStr1 |
|||
if {[info exist lookup($ch)]} { |
|||
incr similarity $n |
|||
incr similarity $lookup($ch) |
|||
} |
|||
} |
|||
return $similarity |
|||
} |
|||
proc MostFreqKSDF {inputStr1 inputStr2 k limit} { |
|||
set h1 [MostFreqKHashing $inputStr1 $k] |
|||
set h2 [MostFreqKHashing $inputStr2 $k] |
|||
expr {$limit - [MostFreqKSimilarity $h1 $h2]} |
|||
}</lang> |
|||
Demonstrating: |
|||
<lang tcl>set str1 "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" |
|||
set str2 "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG" |
|||
puts [MostFreqKHashing $str1 2] |
|||
puts [MostFreqKHashing $str2 2] |
|||
puts [MostFreqKSDF $str1 $str2 2 100]</lang> |
|||
{{out}} |
|||
<pre> |
|||
L9T8 |
|||
F9L8 |
|||
83 |
|||
</pre> |
|||
;A more efficient metric calculator |
|||
This version is appreciably more efficient because it does not compute the intermediate string representation “hash”, instead working directly on the intermediate dictionaries and lists: |
|||
<lang tcl>proc MostFreqKSDF {inputStr1 inputStr2 k limit} { |
|||
set c1 [set c2 {}] |
|||
foreach ch [split $inputStr1 ""] {dict incr c1 $ch} |
|||
foreach ch [split $inputStr2 ""] {dict incr c2 $ch} |
|||
set c2 [lrange [lsort -stride 2 -index 1 -integer -decreasing $c2[set c2 {}]] 0 [expr {$k*2-1}]] |
|||
set s 0 |
|||
foreach {ch n} [lrange [lsort -stride 2 -index 1 -integer -decreasing $c1[set c1 {}]] 0 [expr {$k*2-1}]] { |
|||
if {[dict exists $c2 $ch]} { |
|||
incr s [expr {$n + [dict get $c2 $ch]}] |
|||
} |
|||
} |
|||
return [expr {$limit - $s}] |
|||
}</lang> |
|||
It computes the identical value on the identical inputs. |
|||
=={{header|Python}}== |
=={{header|Python}}== |
||
Line 450: | Line 393: | ||
83 |
83 |
||
</pre> |
</pre> |
||
=={{header|Tcl}}== |
|||
{{works with|Tcl|8.6}} |
|||
<lang tcl>package require Tcl 8.6 |
|||
proc MostFreqKHashing {inputString k} { |
|||
foreach ch [split $inputString ""] {dict incr count $ch} |
|||
join [lrange [lsort -stride 2 -index 1 -integer -decreasing $count] 0 [expr {$k*2-1}]] "" |
|||
} |
|||
proc MostFreqKSimilarity {hashStr1 hashStr2} { |
|||
while {$hashStr2 ne ""} { |
|||
regexp {^(.)(\d+)(.*)$} $hashStr2 -> ch n hashStr2 |
|||
set lookup($ch) $n |
|||
} |
|||
set similarity 0 |
|||
while {$hashStr1 ne ""} { |
|||
regexp {^(.)(\d+)(.*)$} $hashStr1 -> ch n hashStr1 |
|||
if {[info exist lookup($ch)]} { |
|||
incr similarity $n |
|||
incr similarity $lookup($ch) |
|||
} |
|||
} |
|||
return $similarity |
|||
} |
|||
proc MostFreqKSDF {inputStr1 inputStr2 k limit} { |
|||
set h1 [MostFreqKHashing $inputStr1 $k] |
|||
set h2 [MostFreqKHashing $inputStr2 $k] |
|||
expr {$limit - [MostFreqKSimilarity $h1 $h2]} |
|||
}</lang> |
|||
Demonstrating: |
|||
<lang tcl>set str1 "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" |
|||
set str2 "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG" |
|||
puts [MostFreqKHashing $str1 2] |
|||
puts [MostFreqKHashing $str2 2] |
|||
puts [MostFreqKSDF $str1 $str2 2 100]</lang> |
|||
{{out}} |
|||
<pre> |
|||
L9T8 |
|||
F9L8 |
|||
83 |
|||
</pre> |
|||
;A more efficient metric calculator |
|||
This version is appreciably more efficient because it does not compute the intermediate string representation “hash”, instead working directly on the intermediate dictionaries and lists: |
|||
<lang tcl>proc MostFreqKSDF {inputStr1 inputStr2 k limit} { |
|||
set c1 [set c2 {}] |
|||
foreach ch [split $inputStr1 ""] {dict incr c1 $ch} |
|||
foreach ch [split $inputStr2 ""] {dict incr c2 $ch} |
|||
set c2 [lrange [lsort -stride 2 -index 1 -integer -decreasing $c2[set c2 {}]] 0 [expr {$k*2-1}]] |
|||
set s 0 |
|||
foreach {ch n} [lrange [lsort -stride 2 -index 1 -integer -decreasing $c1[set c1 {}]] 0 [expr {$k*2-1}]] { |
|||
if {[dict exists $c2 $ch]} { |
|||
incr s [expr {$n + [dict get $c2 $ch]}] |
|||
} |
|||
} |
|||
return [expr {$limit - $s}] |
|||
}</lang> |
|||
It computes the identical value on the identical inputs. |
|||
=References= |
=References= |