hash_bench_test.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. package internal
  2. import (
  3. "fmt"
  4. "strings"
  5. "testing"
  6. )
  7. // XXHash64sum is a test alias for FastHash to benchmark against SHA256
  8. func XXHash64sum(text string) string {
  9. return FastHash(text)
  10. }
  11. // Test data that matches real usage patterns in the codebase
  12. var (
  13. // Typical policy checker inputs
  14. policyInputs = []string{
  15. "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  16. "User-Agent: bot/1.0",
  17. "User-Agent: GoogleBot/2.1",
  18. "/robots.txt",
  19. "/api/.*",
  20. "10.0.0.0/8",
  21. "192.168.1.0/24",
  22. "172.16.0.0/12",
  23. }
  24. // Challenge data from challengeFor function
  25. challengeInputs = []string{
  26. "Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
  27. "Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
  28. "Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
  29. }
  30. // Bot rule patterns
  31. botRuleInputs = []string{
  32. "GoogleBot::path:/robots.txt",
  33. "BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
  34. "FacebookBot::headers:Accept-Language,User-Agent",
  35. "TwitterBot::cidr:192.168.1.0/24",
  36. }
  37. // CEL expressions from policy rules
  38. celInputs = []string{
  39. `request.headers["User-Agent"].contains("bot")`,
  40. `request.path.startsWith("/api/") && request.method == "POST"`,
  41. `request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
  42. `request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
  43. }
  44. // Thoth ASN checker inputs
  45. asnInputs = []string{
  46. "ASNChecker\nAS 15169\nAS 8075\nAS 32934",
  47. "ASNChecker\nAS 13335\nAS 16509\nAS 14061",
  48. "ASNChecker\nAS 36351\nAS 20940\nAS 8100",
  49. }
  50. )
  51. func BenchmarkSHA256_PolicyInputs(b *testing.B) {
  52. b.ResetTimer()
  53. for i := 0; i < b.N; i++ {
  54. input := policyInputs[i%len(policyInputs)]
  55. _ = SHA256sum(input)
  56. }
  57. }
  58. func BenchmarkXXHash_PolicyInputs(b *testing.B) {
  59. b.ResetTimer()
  60. for i := 0; i < b.N; i++ {
  61. input := policyInputs[i%len(policyInputs)]
  62. _ = XXHash64sum(input)
  63. }
  64. }
  65. func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
  66. b.ResetTimer()
  67. for i := 0; i < b.N; i++ {
  68. input := challengeInputs[i%len(challengeInputs)]
  69. _ = SHA256sum(input)
  70. }
  71. }
  72. func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
  73. b.ResetTimer()
  74. for i := 0; i < b.N; i++ {
  75. input := challengeInputs[i%len(challengeInputs)]
  76. _ = XXHash64sum(input)
  77. }
  78. }
  79. func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
  80. b.ResetTimer()
  81. for i := 0; i < b.N; i++ {
  82. input := botRuleInputs[i%len(botRuleInputs)]
  83. _ = SHA256sum(input)
  84. }
  85. }
  86. func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
  87. b.ResetTimer()
  88. for i := 0; i < b.N; i++ {
  89. input := botRuleInputs[i%len(botRuleInputs)]
  90. _ = XXHash64sum(input)
  91. }
  92. }
  93. func BenchmarkSHA256_CELInputs(b *testing.B) {
  94. b.ResetTimer()
  95. for i := 0; i < b.N; i++ {
  96. input := celInputs[i%len(celInputs)]
  97. _ = SHA256sum(input)
  98. }
  99. }
  100. func BenchmarkXXHash_CELInputs(b *testing.B) {
  101. b.ResetTimer()
  102. for i := 0; i < b.N; i++ {
  103. input := celInputs[i%len(celInputs)]
  104. _ = XXHash64sum(input)
  105. }
  106. }
  107. func BenchmarkSHA256_ASNInputs(b *testing.B) {
  108. b.ResetTimer()
  109. for i := 0; i < b.N; i++ {
  110. input := asnInputs[i%len(asnInputs)]
  111. _ = SHA256sum(input)
  112. }
  113. }
  114. func BenchmarkXXHash_ASNInputs(b *testing.B) {
  115. b.ResetTimer()
  116. for i := 0; i < b.N; i++ {
  117. input := asnInputs[i%len(asnInputs)]
  118. _ = XXHash64sum(input)
  119. }
  120. }
  121. // Benchmark the policy list hashing used in checker.go
  122. func BenchmarkSHA256_PolicyList(b *testing.B) {
  123. b.ResetTimer()
  124. for i := 0; i < b.N; i++ {
  125. var sb strings.Builder
  126. for _, input := range policyInputs {
  127. fmt.Fprintln(&sb, SHA256sum(input))
  128. }
  129. _ = SHA256sum(sb.String())
  130. }
  131. }
  132. func BenchmarkXXHash_PolicyList(b *testing.B) {
  133. b.ResetTimer()
  134. for i := 0; i < b.N; i++ {
  135. var sb strings.Builder
  136. for _, input := range policyInputs {
  137. fmt.Fprintln(&sb, XXHash64sum(input))
  138. }
  139. _ = XXHash64sum(sb.String())
  140. }
  141. }
  142. // Tests that xxhash doesn't have collisions in realistic scenarios
  143. func TestHashCollisions(t *testing.T) {
  144. allInputs := append(append(append(append(policyInputs, challengeInputs...), botRuleInputs...), celInputs...), asnInputs...)
  145. // Start with realistic inputs from actual usage
  146. xxhashHashes := make(map[string]string)
  147. for _, input := range allInputs {
  148. hash := XXHash64sum(input)
  149. if existing, exists := xxhashHashes[hash]; exists {
  150. t.Errorf("XXHash collision detected: %q and %q both hash to %s", input, existing, hash)
  151. }
  152. xxhashHashes[hash] = input
  153. }
  154. t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs))
  155. // Test similar strings that might cause hash collisions
  156. prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
  157. suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
  158. variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
  159. stressCount := 0
  160. for _, prefix := range prefixes {
  161. for _, suffix := range suffixes {
  162. for _, variation := range variations {
  163. for i := 0; i < 100; i++ {
  164. input := fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i)
  165. hash := XXHash64sum(input)
  166. if existing, exists := xxhashHashes[hash]; exists {
  167. t.Errorf("XXHash collision in stress test: %q and %q both hash to %s", input, existing, hash)
  168. }
  169. xxhashHashes[hash] = input
  170. stressCount++
  171. }
  172. }
  173. }
  174. }
  175. t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount)
  176. // Test sequential patterns that might be problematic
  177. patterns := []string{
  178. "192.168.1.%d",
  179. "10.0.0.%d",
  180. "172.16.%d.1",
  181. "challenge-%d",
  182. "bot-rule-%d",
  183. "policy-%016x",
  184. "session-%016x",
  185. }
  186. seqCount := 0
  187. for _, pattern := range patterns {
  188. for i := 0; i < 10000; i++ {
  189. input := fmt.Sprintf(pattern, i)
  190. hash := XXHash64sum(input)
  191. if existing, exists := xxhashHashes[hash]; exists {
  192. t.Errorf("XXHash collision in sequential test: %q and %q both hash to %s", input, existing, hash)
  193. }
  194. xxhashHashes[hash] = input
  195. seqCount++
  196. }
  197. }
  198. t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount)
  199. totalInputs := len(allInputs) + stressCount + seqCount
  200. t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs)
  201. }
  202. // Verify xxhash output works as cache keys
  203. func TestXXHashFormat(t *testing.T) {
  204. testCases := []string{
  205. "short",
  206. "",
  207. "very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
  208. "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  209. }
  210. for _, input := range testCases {
  211. hash := XXHash64sum(input)
  212. // Check it's valid hex
  213. if len(hash) == 0 {
  214. t.Errorf("Empty hash for input %q", input)
  215. }
  216. // xxhash is 64-bit so max 16 hex chars
  217. if len(hash) > 16 {
  218. t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash))
  219. }
  220. // Make sure it's all hex characters
  221. for _, char := range hash {
  222. if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f')) {
  223. t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input)
  224. }
  225. }
  226. t.Logf("Input: %q -> Hash: %s", input, hash)
  227. }
  228. }