robots2policy_test.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. package main
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "reflect"
  8. "strings"
  9. "testing"
  10. "gopkg.in/yaml.v3"
  11. )
  12. type TestCase struct {
  13. name string
  14. robotsFile string
  15. expectedFile string
  16. options TestOptions
  17. }
  // TestOptions lists the converter settings a TestCase may override.
  //
  // A zero-valued field means "leave the current setting alone":
  // TestDataFileConversion applies the string fields only when they are
  // non-empty and crawlDelayWeight only when it is greater than zero.
  type TestOptions struct {
  	format, action, policyName, deniedAction string
  	crawlDelayWeight                         int
  }
  25. func TestDataFileConversion(t *testing.T) {
  26. testCases := []TestCase{
  27. {
  28. name: "simple_default",
  29. robotsFile: "simple.robots.txt",
  30. expectedFile: "simple.yaml",
  31. options: TestOptions{format: "yaml"},
  32. },
  33. {
  34. name: "simple_json",
  35. robotsFile: "simple.robots.txt",
  36. expectedFile: "simple.json",
  37. options: TestOptions{format: "json"},
  38. },
  39. {
  40. name: "simple_deny_action",
  41. robotsFile: "simple.robots.txt",
  42. expectedFile: "deny-action.yaml",
  43. options: TestOptions{format: "yaml", action: "DENY"},
  44. },
  45. {
  46. name: "simple_custom_name",
  47. robotsFile: "simple.robots.txt",
  48. expectedFile: "custom-name.yaml",
  49. options: TestOptions{format: "yaml", policyName: "my-custom-policy"},
  50. },
  51. {
  52. name: "blacklist_with_crawl_delay",
  53. robotsFile: "blacklist.robots.txt",
  54. expectedFile: "blacklist.yaml",
  55. options: TestOptions{format: "yaml", crawlDelayWeight: 3},
  56. },
  57. {
  58. name: "wildcards",
  59. robotsFile: "wildcards.robots.txt",
  60. expectedFile: "wildcards.yaml",
  61. options: TestOptions{format: "yaml"},
  62. },
  63. {
  64. name: "empty_file",
  65. robotsFile: "empty.robots.txt",
  66. expectedFile: "empty.yaml",
  67. options: TestOptions{format: "yaml"},
  68. },
  69. {
  70. name: "complex_scenario",
  71. robotsFile: "complex.robots.txt",
  72. expectedFile: "complex.yaml",
  73. options: TestOptions{format: "yaml", crawlDelayWeight: 5},
  74. },
  75. {
  76. name: "consecutive_user_agents",
  77. robotsFile: "consecutive.robots.txt",
  78. expectedFile: "consecutive.yaml",
  79. options: TestOptions{format: "yaml", crawlDelayWeight: 3},
  80. },
  81. }
  82. for _, tc := range testCases {
  83. t.Run(tc.name, func(t *testing.T) {
  84. robotsPath := filepath.Join("testdata", tc.robotsFile)
  85. expectedPath := filepath.Join("testdata", tc.expectedFile)
  86. // Read robots.txt input
  87. robotsFile, err := os.Open(robotsPath)
  88. if err != nil {
  89. t.Fatalf("Failed to open robots file %s: %v", robotsPath, err)
  90. }
  91. defer robotsFile.Close()
  92. // Parse robots.txt
  93. rules, err := parseRobotsTxt(robotsFile)
  94. if err != nil {
  95. t.Fatalf("Failed to parse robots.txt: %v", err)
  96. }
  97. // Set test options
  98. oldFormat := *outputFormat
  99. oldAction := *baseAction
  100. oldCrawlDelay := *crawlDelay
  101. oldPolicyName := *policyName
  102. oldDeniedAction := *userAgentDeny
  103. if tc.options.format != "" {
  104. *outputFormat = tc.options.format
  105. }
  106. if tc.options.action != "" {
  107. *baseAction = tc.options.action
  108. }
  109. if tc.options.crawlDelayWeight > 0 {
  110. *crawlDelay = tc.options.crawlDelayWeight
  111. }
  112. if tc.options.policyName != "" {
  113. *policyName = tc.options.policyName
  114. }
  115. if tc.options.deniedAction != "" {
  116. *userAgentDeny = tc.options.deniedAction
  117. }
  118. // Restore options after test
  119. defer func() {
  120. *outputFormat = oldFormat
  121. *baseAction = oldAction
  122. *crawlDelay = oldCrawlDelay
  123. *policyName = oldPolicyName
  124. *userAgentDeny = oldDeniedAction
  125. }()
  126. // Convert to Anubis rules
  127. anubisRules := convertToAnubisRules(rules)
  128. // Generate output
  129. var actualOutput []byte
  130. switch strings.ToLower(*outputFormat) {
  131. case "yaml":
  132. actualOutput, err = yaml.Marshal(anubisRules)
  133. case "json":
  134. actualOutput, err = json.MarshalIndent(anubisRules, "", " ")
  135. }
  136. if err != nil {
  137. t.Fatalf("Failed to marshal output: %v", err)
  138. }
  139. // Read expected output
  140. expectedOutput, err := os.ReadFile(expectedPath)
  141. if err != nil {
  142. t.Fatalf("Failed to read expected file %s: %v", expectedPath, err)
  143. }
  144. if strings.ToLower(*outputFormat) == "yaml" {
  145. var actualData []interface{}
  146. var expectedData []interface{}
  147. err = yaml.Unmarshal(actualOutput, &actualData)
  148. if err != nil {
  149. t.Fatalf("Failed to unmarshal actual output: %v", err)
  150. }
  151. err = yaml.Unmarshal(expectedOutput, &expectedData)
  152. if err != nil {
  153. t.Fatalf("Failed to unmarshal expected output: %v", err)
  154. }
  155. // Compare data structures
  156. if !compareData(actualData, expectedData) {
  157. actualStr := strings.TrimSpace(string(actualOutput))
  158. expectedStr := strings.TrimSpace(string(expectedOutput))
  159. t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
  160. }
  161. } else {
  162. var actualData []interface{}
  163. var expectedData []interface{}
  164. err = json.Unmarshal(actualOutput, &actualData)
  165. if err != nil {
  166. t.Fatalf("Failed to unmarshal actual JSON output: %v", err)
  167. }
  168. err = json.Unmarshal(expectedOutput, &expectedData)
  169. if err != nil {
  170. t.Fatalf("Failed to unmarshal expected JSON output: %v", err)
  171. }
  172. // Compare data structures
  173. if !compareData(actualData, expectedData) {
  174. actualStr := strings.TrimSpace(string(actualOutput))
  175. expectedStr := strings.TrimSpace(string(expectedOutput))
  176. t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
  177. }
  178. }
  179. })
  180. }
  181. }
  182. func TestCaseInsensitiveParsing(t *testing.T) {
  183. robotsTxt := `User-Agent: *
  184. Disallow: /admin
  185. Crawl-Delay: 10
  186. User-agent: TestBot
  187. disallow: /test
  188. crawl-delay: 5
  189. USER-AGENT: UpperBot
  190. DISALLOW: /upper
  191. CRAWL-DELAY: 20`
  192. reader := strings.NewReader(robotsTxt)
  193. rules, err := parseRobotsTxt(reader)
  194. if err != nil {
  195. t.Fatalf("Failed to parse case-insensitive robots.txt: %v", err)
  196. }
  197. expectedRules := 3
  198. if len(rules) != expectedRules {
  199. t.Errorf("Expected %d rules, got %d", expectedRules, len(rules))
  200. }
  201. // Check that all crawl delays were parsed
  202. for i, rule := range rules {
  203. expectedDelays := []int{10, 5, 20}
  204. if rule.CrawlDelay != expectedDelays[i] {
  205. t.Errorf("Rule %d: expected crawl delay %d, got %d", i, expectedDelays[i], rule.CrawlDelay)
  206. }
  207. }
  208. }
  209. func TestVariousOutputFormats(t *testing.T) {
  210. robotsTxt := `User-agent: *
  211. Disallow: /admin`
  212. reader := strings.NewReader(robotsTxt)
  213. rules, err := parseRobotsTxt(reader)
  214. if err != nil {
  215. t.Fatalf("Failed to parse robots.txt: %v", err)
  216. }
  217. oldPolicyName := *policyName
  218. *policyName = "test-policy"
  219. defer func() { *policyName = oldPolicyName }()
  220. anubisRules := convertToAnubisRules(rules)
  221. // Test YAML output
  222. yamlOutput, err := yaml.Marshal(anubisRules)
  223. if err != nil {
  224. t.Fatalf("Failed to marshal YAML: %v", err)
  225. }
  226. if !strings.Contains(string(yamlOutput), "name: test-policy-disallow-1") {
  227. t.Errorf("YAML output doesn't contain expected rule name")
  228. }
  229. // Test JSON output
  230. jsonOutput, err := json.MarshalIndent(anubisRules, "", " ")
  231. if err != nil {
  232. t.Fatalf("Failed to marshal JSON: %v", err)
  233. }
  234. if !strings.Contains(string(jsonOutput), `"name": "test-policy-disallow-1"`) {
  235. t.Errorf("JSON output doesn't contain expected rule name")
  236. }
  237. }
  238. func TestDifferentActions(t *testing.T) {
  239. robotsTxt := `User-agent: *
  240. Disallow: /admin`
  241. testActions := []string{"ALLOW", "DENY", "CHALLENGE", "WEIGH"}
  242. for _, action := range testActions {
  243. t.Run("action_"+action, func(t *testing.T) {
  244. reader := strings.NewReader(robotsTxt)
  245. rules, err := parseRobotsTxt(reader)
  246. if err != nil {
  247. t.Fatalf("Failed to parse robots.txt: %v", err)
  248. }
  249. oldAction := *baseAction
  250. *baseAction = action
  251. defer func() { *baseAction = oldAction }()
  252. anubisRules := convertToAnubisRules(rules)
  253. if len(anubisRules) != 1 {
  254. t.Fatalf("Expected 1 rule, got %d", len(anubisRules))
  255. }
  256. if anubisRules[0].Action != action {
  257. t.Errorf("Expected action %s, got %s", action, anubisRules[0].Action)
  258. }
  259. })
  260. }
  261. }
  262. func TestPolicyNaming(t *testing.T) {
  263. robotsTxt := `User-agent: *
  264. Disallow: /admin
  265. Disallow: /private
  266. User-agent: BadBot
  267. Disallow: /`
  268. testNames := []string{"custom-policy", "my-rules", "site-protection"}
  269. for _, name := range testNames {
  270. t.Run("name_"+name, func(t *testing.T) {
  271. reader := strings.NewReader(robotsTxt)
  272. rules, err := parseRobotsTxt(reader)
  273. if err != nil {
  274. t.Fatalf("Failed to parse robots.txt: %v", err)
  275. }
  276. oldName := *policyName
  277. *policyName = name
  278. defer func() { *policyName = oldName }()
  279. anubisRules := convertToAnubisRules(rules)
  280. // Check that all rule names use the custom prefix
  281. for _, rule := range anubisRules {
  282. if !strings.HasPrefix(rule.Name, name+"-") {
  283. t.Errorf("Rule name %s doesn't start with expected prefix %s-", rule.Name, name)
  284. }
  285. }
  286. })
  287. }
  288. }
  289. func TestCrawlDelayWeights(t *testing.T) {
  290. robotsTxt := `User-agent: *
  291. Disallow: /admin
  292. Crawl-delay: 10
  293. User-agent: SlowBot
  294. Disallow: /slow
  295. Crawl-delay: 60`
  296. testWeights := []int{1, 5, 10, 25}
  297. for _, weight := range testWeights {
  298. t.Run(fmt.Sprintf("weight_%d", weight), func(t *testing.T) {
  299. reader := strings.NewReader(robotsTxt)
  300. rules, err := parseRobotsTxt(reader)
  301. if err != nil {
  302. t.Fatalf("Failed to parse robots.txt: %v", err)
  303. }
  304. oldWeight := *crawlDelay
  305. *crawlDelay = weight
  306. defer func() { *crawlDelay = oldWeight }()
  307. anubisRules := convertToAnubisRules(rules)
  308. // Count weight rules and verify they have correct weight
  309. weightRules := 0
  310. for _, rule := range anubisRules {
  311. if rule.Action == "WEIGH" && rule.Weight != nil {
  312. weightRules++
  313. if rule.Weight.Adjust != weight {
  314. t.Errorf("Expected weight %d, got %d", weight, rule.Weight.Adjust)
  315. }
  316. }
  317. }
  318. expectedWeightRules := 2 // One for *, one for SlowBot
  319. if weightRules != expectedWeightRules {
  320. t.Errorf("Expected %d weight rules, got %d", expectedWeightRules, weightRules)
  321. }
  322. })
  323. }
  324. }
  325. func TestBlacklistActions(t *testing.T) {
  326. robotsTxt := `User-agent: BadBot
  327. Disallow: /
  328. User-agent: SpamBot
  329. Disallow: /`
  330. testActions := []string{"DENY", "CHALLENGE"}
  331. for _, action := range testActions {
  332. t.Run("blacklist_"+action, func(t *testing.T) {
  333. reader := strings.NewReader(robotsTxt)
  334. rules, err := parseRobotsTxt(reader)
  335. if err != nil {
  336. t.Fatalf("Failed to parse robots.txt: %v", err)
  337. }
  338. oldAction := *userAgentDeny
  339. *userAgentDeny = action
  340. defer func() { *userAgentDeny = oldAction }()
  341. anubisRules := convertToAnubisRules(rules)
  342. // All rules should be blacklist rules with the specified action
  343. for _, rule := range anubisRules {
  344. if !strings.Contains(rule.Name, "blacklist") {
  345. t.Errorf("Expected blacklist rule, got %s", rule.Name)
  346. }
  347. if rule.Action != action {
  348. t.Errorf("Expected action %s, got %s", action, rule.Action)
  349. }
  350. }
  351. })
  352. }
  353. }
  // compareData reports whether actual and expected are deeply equal according
  // to reflect.DeepEqual. The comparison is strict and applies no normalisation:
  // values that differ only in dynamic type (for example int versus float64), or
  // a nil slice versus an empty one, are reported as different.
  func compareData(actual, expected interface{}) bool {
  	same := reflect.DeepEqual(actual, expected)
  	return same
  }
  358. }