-
-
Notifications
You must be signed in to change notification settings - Fork 359
/
Copy pathArrayCrawlQueue.php
102 lines (80 loc) · 2.32 KB
/
ArrayCrawlQueue.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<?php
namespace Spatie\Crawler\CrawlQueues;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Exceptions\InvalidUrl;
use Spatie\Crawler\Exceptions\UrlNotFoundByIndex;
/**
 * In-memory crawl queue backed by two plain PHP arrays.
 *
 * Every URL that has ever been added is kept in {@see $urls}; URLs that still
 * have to be processed are additionally kept in {@see $pendingUrls}. Both
 * arrays are keyed by the string form of the URL, so membership checks are
 * simple `isset()` lookups.
 */
class ArrayCrawlQueue implements CrawlQueue
{
    /**
     * All known URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected array $urls = [];

    /**
     * Pending URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected array $pendingUrls = [];

    /**
     * Register a URL with the queue and mark it as pending.
     *
     * A URL whose string form is already known is ignored: the given object is
     * not stored and does not get an id assigned. For new URLs the string form
     * of the URL is used as the id.
     *
     * @return CrawlQueue This queue, to allow chaining.
     */
    public function add(CrawlUrl $crawlUrl): CrawlQueue
    {
        $urlString = (string) $crawlUrl->url;

        if (isset($this->urls[$urlString])) {
            return $this;
        }

        $crawlUrl->setId($urlString);

        $this->urls[$urlString] = $crawlUrl;
        $this->pendingUrls[$urlString] = $crawlUrl;

        return $this;
    }

    /**
     * Tell whether at least one URL is still waiting to be processed.
     */
    public function hasPendingUrls(): bool
    {
        return $this->pendingUrls !== [];
    }

    /**
     * Look up a known URL by the id that was assigned in {@see add()}.
     *
     * @param mixed $id Key into the list of known URLs (the URL string).
     *
     * @throws UrlNotFoundByIndex When no URL is stored under the given id.
     */
    public function getUrlById($id): CrawlUrl
    {
        if (! isset($this->urls[$id])) {
            throw new UrlNotFoundByIndex("Crawl url {$id} not found in collection.");
        }

        return $this->urls[$id];
    }

    /**
     * Tell whether the URL is known to the queue and no longer pending.
     *
     * URLs that were never added are reported as not processed.
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $crawlUrl): bool
    {
        $urlString = (string) $crawlUrl->url;

        return ! isset($this->pendingUrls[$urlString])
            && isset($this->urls[$urlString]);
    }

    /**
     * Remove the URL from the pending list.
     *
     * The URL stays in the list of known URLs, which is what makes it count as
     * processed afterwards. Unknown URLs are silently ignored.
     */
    public function markAsProcessed(CrawlUrl $crawlUrl): void
    {
        $urlString = (string) $crawlUrl->url;

        unset($this->pendingUrls[$urlString]);
    }

    /**
     * Number of known URLs that are no longer pending.
     */
    public function getProcessedUrlCount(): int
    {
        return count($this->urls) - count($this->pendingUrls);
    }

    /**
     * Tell whether the URL has been added to the queue, pending or not.
     *
     * The parameter's union type already guarantees that only a CrawlUrl or a
     * UriInterface can reach the body (anything else fails with a TypeError at
     * the call boundary), so no fallback branch for other types is needed.
     */
    public function has(CrawlUrl|UriInterface $crawlUrl): bool
    {
        $urlString = $crawlUrl instanceof CrawlUrl
            ? (string) $crawlUrl->url
            : (string) $crawlUrl;

        return isset($this->urls[$urlString]);
    }

    /**
     * Return the first pending URL in insertion order without removing it.
     *
     * @return CrawlUrl|null Null when nothing is pending.
     */
    public function getPendingUrl(): ?CrawlUrl
    {
        $firstKey = array_key_first($this->pendingUrls);

        if ($firstKey === null) {
            return null;
        }

        return $this->pendingUrls[$firstKey];
    }
}