main.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. package main
import (
	"bufio"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/TecharoHQ/anubis/lib/config"
	"sigs.k8s.io/yaml"
)
  16. var (
  17. inputFile = flag.String("input", "", "path to robots.txt file (use - for stdin)")
  18. outputFile = flag.String("output", "", "output file path (use - for stdout, defaults to stdout)")
  19. outputFormat = flag.String("format", "yaml", "output format: yaml or json")
  20. baseAction = flag.String("action", "CHALLENGE", "default action for disallowed paths: ALLOW, DENY, CHALLENGE, WEIGH")
  21. crawlDelay = flag.Int("crawl-delay-weight", 0, "if > 0, add weight adjustment for crawl-delay (difficulty adjustment)")
  22. policyName = flag.String("name", "robots-txt-policy", "name for the generated policy")
  23. userAgentDeny = flag.String("deny-user-agents", "DENY", "action for specifically blocked user agents: DENY, CHALLENGE")
  24. helpFlag = flag.Bool("help", false, "show help")
  25. )
  26. type RobotsRule struct {
  27. UserAgents []string
  28. Disallows []string
  29. Allows []string
  30. CrawlDelay int
  31. IsBlacklist bool // true if this is a specifically denied user agent
  32. }
  33. type AnubisRule struct {
  34. Expression *config.ExpressionOrList `yaml:"expression,omitempty" json:"expression,omitempty"`
  35. Challenge *config.ChallengeRules `yaml:"challenge,omitempty" json:"challenge,omitempty"`
  36. Weight *config.Weight `yaml:"weight,omitempty" json:"weight,omitempty"`
  37. Name string `yaml:"name" json:"name"`
  38. Action string `yaml:"action" json:"action"`
  39. }
  40. func init() {
  41. flag.Usage = func() {
  42. fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
  43. fmt.Fprintf(os.Stderr, "%s [options] -input <robots.txt>\n\n", os.Args[0])
  44. flag.PrintDefaults()
  45. fmt.Fprintln(os.Stderr, "\nExamples:")
  46. fmt.Fprintln(os.Stderr, " # Convert local robots.txt file")
  47. fmt.Fprintln(os.Stderr, " robots2policy -input robots.txt -output policy.yaml")
  48. fmt.Fprintln(os.Stderr, "")
  49. fmt.Fprintln(os.Stderr, " # Convert from URL")
  50. fmt.Fprintln(os.Stderr, " robots2policy -input https://example.com/robots.txt -format json")
  51. fmt.Fprintln(os.Stderr, "")
  52. fmt.Fprintln(os.Stderr, " # Read from stdin, write to stdout")
  53. fmt.Fprintln(os.Stderr, " curl https://example.com/robots.txt | robots2policy -input -")
  54. os.Exit(2)
  55. }
  56. }
  57. func main() {
  58. flag.Parse()
  59. if len(flag.Args()) > 0 || *helpFlag || *inputFile == "" {
  60. flag.Usage()
  61. }
  62. // Read robots.txt
  63. var input io.Reader
  64. if *inputFile == "-" {
  65. input = os.Stdin
  66. } else if strings.HasPrefix(*inputFile, "http://") || strings.HasPrefix(*inputFile, "https://") {
  67. resp, err := http.Get(*inputFile)
  68. if err != nil {
  69. log.Fatalf("failed to fetch robots.txt from URL: %v", err)
  70. }
  71. defer resp.Body.Close()
  72. input = resp.Body
  73. } else {
  74. file, err := os.Open(*inputFile)
  75. if err != nil {
  76. log.Fatalf("failed to open input file: %v", err)
  77. }
  78. defer file.Close()
  79. input = file
  80. }
  81. // Parse robots.txt
  82. rules, err := parseRobotsTxt(input)
  83. if err != nil {
  84. log.Fatalf("failed to parse robots.txt: %v", err)
  85. }
  86. // Convert to Anubis rules
  87. anubisRules := convertToAnubisRules(rules)
  88. // Check if any rules were generated
  89. if len(anubisRules) == 0 {
  90. log.Fatal("no valid rules generated from robots.txt - file may be empty or contain no disallow directives")
  91. }
  92. // Generate output
  93. var output []byte
  94. switch strings.ToLower(*outputFormat) {
  95. case "yaml":
  96. output, err = yaml.Marshal(anubisRules)
  97. case "json":
  98. output, err = json.MarshalIndent(anubisRules, "", " ")
  99. default:
  100. log.Fatalf("unsupported output format: %s (use yaml or json)", *outputFormat)
  101. }
  102. if err != nil {
  103. log.Fatalf("failed to marshal output: %v", err)
  104. }
  105. // Write output
  106. if *outputFile == "" || *outputFile == "-" {
  107. fmt.Print(string(output))
  108. } else {
  109. err = os.WriteFile(*outputFile, output, 0644)
  110. if err != nil {
  111. log.Fatalf("failed to write output file: %v", err)
  112. }
  113. fmt.Printf("Generated Anubis policy written to %s\n", *outputFile)
  114. }
  115. }
  116. func createRuleFromAccumulated(userAgents, disallows, allows []string, crawlDelay int) RobotsRule {
  117. rule := RobotsRule{
  118. UserAgents: make([]string, len(userAgents)),
  119. Disallows: make([]string, len(disallows)),
  120. Allows: make([]string, len(allows)),
  121. CrawlDelay: crawlDelay,
  122. }
  123. copy(rule.UserAgents, userAgents)
  124. copy(rule.Disallows, disallows)
  125. copy(rule.Allows, allows)
  126. return rule
  127. }
  128. func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
  129. scanner := bufio.NewScanner(input)
  130. var rules []RobotsRule
  131. var currentUserAgents []string
  132. var currentDisallows []string
  133. var currentAllows []string
  134. var currentCrawlDelay int
  135. for scanner.Scan() {
  136. line := strings.TrimSpace(scanner.Text())
  137. // Skip empty lines and comments
  138. if line == "" || strings.HasPrefix(line, "#") {
  139. continue
  140. }
  141. // Split on first colon
  142. parts := strings.SplitN(line, ":", 2)
  143. if len(parts) != 2 {
  144. continue
  145. }
  146. directive := strings.TrimSpace(strings.ToLower(parts[0]))
  147. value := strings.TrimSpace(parts[1])
  148. switch directive {
  149. case "user-agent":
  150. // If we have accumulated rules with directives and encounter a new user-agent,
  151. // flush the current rules
  152. if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
  153. rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
  154. rules = append(rules, rule)
  155. // Reset for next group
  156. currentUserAgents = nil
  157. currentDisallows = nil
  158. currentAllows = nil
  159. currentCrawlDelay = 0
  160. }
  161. currentUserAgents = append(currentUserAgents, value)
  162. case "disallow":
  163. if len(currentUserAgents) > 0 && value != "" {
  164. currentDisallows = append(currentDisallows, value)
  165. }
  166. case "allow":
  167. if len(currentUserAgents) > 0 && value != "" {
  168. currentAllows = append(currentAllows, value)
  169. }
  170. case "crawl-delay":
  171. if len(currentUserAgents) > 0 {
  172. if delay, err := parseIntSafe(value); err == nil {
  173. currentCrawlDelay = delay
  174. }
  175. }
  176. }
  177. }
  178. // Don't forget the last group of rules
  179. if len(currentUserAgents) > 0 {
  180. rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
  181. rules = append(rules, rule)
  182. }
  183. // Mark blacklisted user agents (those with "Disallow: /")
  184. for i := range rules {
  185. for _, disallow := range rules[i].Disallows {
  186. if disallow == "/" {
  187. rules[i].IsBlacklist = true
  188. break
  189. }
  190. }
  191. }
  192. return rules, scanner.Err()
  193. }
  194. func parseIntSafe(s string) (int, error) {
  195. var result int
  196. _, err := fmt.Sscanf(s, "%d", &result)
  197. return result, err
  198. }
  199. func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
  200. var anubisRules []AnubisRule
  201. ruleCounter := 0
  202. // Process each robots rule individually
  203. for _, robotsRule := range robotsRules {
  204. userAgents := robotsRule.UserAgents
  205. // Handle crawl delay
  206. if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
  207. ruleCounter++
  208. rule := AnubisRule{
  209. Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
  210. Action: "WEIGH",
  211. Weight: &config.Weight{Adjust: *crawlDelay},
  212. }
  213. if len(userAgents) == 1 && userAgents[0] == "*" {
  214. rule.Expression = &config.ExpressionOrList{
  215. All: []string{"true"}, // Always applies
  216. }
  217. } else if len(userAgents) == 1 {
  218. rule.Expression = &config.ExpressionOrList{
  219. All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
  220. }
  221. } else {
  222. // Multiple user agents - use any block
  223. var expressions []string
  224. for _, ua := range userAgents {
  225. if ua == "*" {
  226. expressions = append(expressions, "true")
  227. } else {
  228. expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
  229. }
  230. }
  231. rule.Expression = &config.ExpressionOrList{
  232. Any: expressions,
  233. }
  234. }
  235. anubisRules = append(anubisRules, rule)
  236. }
  237. // Handle blacklisted user agents
  238. if robotsRule.IsBlacklist {
  239. ruleCounter++
  240. rule := AnubisRule{
  241. Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
  242. Action: *userAgentDeny,
  243. }
  244. if len(userAgents) == 1 {
  245. userAgent := userAgents[0]
  246. if userAgent == "*" {
  247. // This would block everything - convert to a weight adjustment instead
  248. rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
  249. rule.Action = "WEIGH"
  250. rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
  251. rule.Expression = &config.ExpressionOrList{
  252. All: []string{"true"}, // Always applies
  253. }
  254. } else {
  255. rule.Expression = &config.ExpressionOrList{
  256. All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
  257. }
  258. }
  259. } else {
  260. // Multiple user agents - use any block
  261. var expressions []string
  262. for _, ua := range userAgents {
  263. if ua == "*" {
  264. expressions = append(expressions, "true")
  265. } else {
  266. expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
  267. }
  268. }
  269. rule.Expression = &config.ExpressionOrList{
  270. Any: expressions,
  271. }
  272. }
  273. anubisRules = append(anubisRules, rule)
  274. }
  275. // Handle specific disallow rules
  276. for _, disallow := range robotsRule.Disallows {
  277. if disallow == "/" {
  278. continue // Already handled as blacklist above
  279. }
  280. ruleCounter++
  281. rule := AnubisRule{
  282. Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
  283. Action: *baseAction,
  284. }
  285. // Build CEL expression
  286. var conditions []string
  287. // Add user agent conditions
  288. if len(userAgents) == 1 && userAgents[0] == "*" {
  289. // Wildcard user agent - no user agent condition needed
  290. } else if len(userAgents) == 1 {
  291. conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
  292. } else {
  293. // For multiple user agents, we need to use a more complex expression
  294. // This is a limitation - we can't easily combine any for user agents with all for path
  295. // So we'll create separate rules for each user agent
  296. for _, ua := range userAgents {
  297. if ua == "*" {
  298. continue // Skip wildcard as it's handled separately
  299. }
  300. ruleCounter++
  301. subRule := AnubisRule{
  302. Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
  303. Action: *baseAction,
  304. Expression: &config.ExpressionOrList{
  305. All: []string{
  306. fmt.Sprintf("userAgent.contains(%q)", ua),
  307. buildPathCondition(disallow),
  308. },
  309. },
  310. }
  311. anubisRules = append(anubisRules, subRule)
  312. }
  313. continue
  314. }
  315. // Add path condition
  316. pathCondition := buildPathCondition(disallow)
  317. conditions = append(conditions, pathCondition)
  318. rule.Expression = &config.ExpressionOrList{
  319. All: conditions,
  320. }
  321. anubisRules = append(anubisRules, rule)
  322. }
  323. }
  324. return anubisRules
  325. }
  326. func buildPathCondition(robotsPath string) string {
  327. // Handle wildcards in robots.txt paths
  328. if strings.Contains(robotsPath, "*") || strings.Contains(robotsPath, "?") {
  329. // Convert robots.txt wildcards to regex
  330. regex := regexp.QuoteMeta(robotsPath)
  331. regex = strings.ReplaceAll(regex, `\*`, `.*`) // * becomes .*
  332. regex = strings.ReplaceAll(regex, `\?`, `.`) // ? becomes .
  333. regex = "^" + regex
  334. return fmt.Sprintf("path.matches(%q)", regex)
  335. }
  336. // Simple prefix match for most cases
  337. return fmt.Sprintf("path.startsWith(%q)", robotsPath)
  338. }